From bd7b7ce96db4487bb77692a85ee4489fd2c395df Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Wed, 22 Apr 2026 12:06:36 -0700 Subject: nvme-auth: Hash DH shared secret to create session key The NVMe Base Specification 8.3.5.5.9 states that the session key Ks shall be computed from the ephemeral DH key by applying the hash function selected by the HashID parameter. The current implementation stores the raw DH shared secret as the session key without hashing it. This causes redundant hash operations: 1. Augmented challenge computation (section 8.3.5.5.4) requires Ca = HMAC(H(g^xy mod p), C). The code compensates by hashing the unhashed session key in nvme_auth_augmented_challenge() to produce the correct result. 2. PSK generation (section 8.3.5.5.9) requires PSK = HMAC(Ks, C1 || C2) where Ks should already be H(g^xy mod p). As the DH shared secret is always larger than the HMAC block size, HMAC internally hashes it before use, accidentally producing the correct result. When using secure channel concatenation with bidirectional authentication, this results in hashing the DH value three times: twice for augmented challenge calculations and once during PSK generation. Fix this by: - Modifying nvme_auth_gen_shared_secret() to hash the DH shared secret once after computation: Ks = H(g^xy mod p) - Removing the hash operation from nvme_auth_augmented_challenge() as the session key is now already hashed - Updating session key buffer size from DH key size to hash output size - Adding specification references in comments This avoid storing the raw DH shared secret and reduces the number of hash operations from three to one when using secure channel concatenation. Reviewed-by: Hannes Reinecke Reviewed-by: Eric Biggers Signed-off-by: Chris Leech Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 184a1f9510fa..89902ae8b929 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -49,9 +49,9 @@ int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, u8 *host_key, size_t host_key_len); -int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - const u8 *ctrl_key, size_t ctrl_key_len, - u8 *sess_key, size_t sess_key_len); +int nvme_auth_gen_session_key(struct crypto_kpp *dh_tfm, + const u8 *public_key, size_t public_key_len, + u8 *sess_key, size_t sess_key_len, u8 hash_id); int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, const u8 *c1, const u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); -- cgit v1.2.3 From 0b13173d27fa15679463b62a10cfa8b3d6c3a71c Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 15 Apr 2026 13:41:01 +0800 Subject: dma-buf: fix stale @lock references in struct dma_buf documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel-doc comments for vmapping_counter and vmap_ptr in struct dma_buf reference "@lock" as the protecting lock, but struct dma_buf no longer has a "lock" member. The mutex was removed in favor of using the dma_resv lock exclusively. The implementation correctly uses dma_resv_assert_held(dmabuf->resv) in dma_buf_vmap() and dma_buf_vunmap(), so update the documentation to reference @resv instead. Signed-off-by: gaoxiang17 Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260415054101.535520-1-gxxa03070307@gmail.com --- include/linux/dma-buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 133b9e637b55..ef6d93fd7a2c 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -322,13 +322,13 @@ struct dma_buf { * @vmapping_counter: * * Used internally to refcnt the vmaps returned by dma_buf_vmap(). - * Protected by @lock. + * Protected by @resv. */ unsigned vmapping_counter; /** * @vmap_ptr: - * The current vmap ptr if @vmapping_counter > 0. Protected by @lock. + * The current vmap ptr if @vmapping_counter > 0. Protected by @resv. */ struct iosys_map vmap_ptr; -- cgit v1.2.3 From 619eab23e1ce7c97e54bfc5a417306d94b3f6f13 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 21 Apr 2026 11:21:50 +0100 Subject: mm/vma: do not try to unmap a VMA if mmap_prepare() invoked from mmap() The mmap_prepare hook functionality includes the ability to invoke mmap_prepare() from the mmap() hook of existing 'stacked' drivers, that is ones which are capable of calling the mmap hooks of other drivers/file systems (e.g. overlayfs, shm). As part of the mmap_prepare action functionality, we deal with errors by unmapping the VMA should one arise. This works in the usual mmap_prepare case, as we invoke this action at the last moment, when the VMA is established in the maple tree. However, the mmap() hook passes a not-fully-established VMA pointer to the caller (which is the motivation behind the mmap_prepare() work), which is detached. So attempting to unmap a VMA in this state will be problematic, with the most obvious symptom being a warning in vma_mark_detached(), because the VMA is already detached. It's also unncessary - the mmap() handler will clean up the VMA on error. So to fix this issue, this patch propagates whether or not an mmap action is being completed via the compatibility layer or directly. If the former, then we do not attempt VMA cleanup, if the latter, then we do. This patch also updates the userland VMA tests to reflect the change. Link: https://lore.kernel.org/20260421102150.189982-1-ljs@kernel.org Fixes: ac0a3fc9c07d ("mm: add ability to take further action in vm_area_desc") Signed-off-by: Lorenzo Stoakes Reported-by: syzbot+db390288d141a1dccf96@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69e69734.050a0220.24bfd3.0027.GAE@google.com/ Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/util.c | 26 +++++++++++++++++--------- mm/vma.c | 3 ++- tools/testing/vma/include/dup.h | 2 +- tools/testing/vma/include/stubs.h | 3 ++- 5 files changed, 23 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b776907152e..af23453e9dbd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4391,7 +4391,7 @@ static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action); + struct mmap_action *action, bool is_compat); /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, diff --git a/mm/util.c b/mm/util.c index 232c3930a662..3cc949a0b7ed 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1232,7 +1232,7 @@ int __compat_vma_mmap(struct vm_area_desc *desc, /* Update the VMA from the descriptor. */ compat_set_vma_from_desc(vma, desc); /* Complete any specified mmap actions. */ - return mmap_action_complete(vma, &desc->action); + return mmap_action_complete(vma, &desc->action, /*is_compat=*/true); } EXPORT_SYMBOL(__compat_vma_mmap); @@ -1389,7 +1389,8 @@ static int call_vma_mapped(struct vm_area_struct *vma) } static int mmap_action_finish(struct vm_area_struct *vma, - struct mmap_action *action, int err) + struct mmap_action *action, int err, + bool is_compat) { size_t len; @@ -1400,8 +1401,12 @@ static int mmap_action_finish(struct vm_area_struct *vma, /* do_munmap() might take rmap lock, so release if held. */ maybe_rmap_unlock_action(vma, action); - if (!err) - return 0; + /* + * If this is invoked from the compatibility layer, post-mmap() hook + * logic will handle cleanup for us. + */ + if (!err || is_compat) + return err; /* * If an error occurs, unmap the VMA altogether and return an error. We @@ -1451,13 +1456,15 @@ EXPORT_SYMBOL(mmap_action_prepare); * mmap_action_complete - Execute VMA descriptor action. * @vma: The VMA to perform the action upon. * @action: The action to perform. + * @is_compat: Is this being invoked from the compatibility layer? * * Similar to mmap_action_prepare(). * - * Return: 0 on success, or error, at which point the VMA will be unmapped. + * Return: 0 on success, or error, at which point the VMA will be unmapped if + * !@is_compat. */ int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action) + struct mmap_action *action, bool is_compat) { int err = 0; @@ -1478,7 +1485,7 @@ int mmap_action_complete(struct vm_area_struct *vma, break; } - return mmap_action_finish(vma, action, err); + return mmap_action_finish(vma, action, err, is_compat); } EXPORT_SYMBOL(mmap_action_complete); #else @@ -1500,7 +1507,8 @@ int mmap_action_prepare(struct vm_area_desc *desc) EXPORT_SYMBOL(mmap_action_prepare); int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action) + struct mmap_action *action, + bool is_compat) { int err = 0; @@ -1517,7 +1525,7 @@ int mmap_action_complete(struct vm_area_struct *vma, break; } - return mmap_action_finish(vma, action, err); + return mmap_action_finish(vma, action, err, is_compat); } EXPORT_SYMBOL(mmap_action_complete); #endif diff --git a/mm/vma.c b/mm/vma.c index 377321b48734..d90791b00a7b 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2780,7 +2780,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { - error = mmap_action_complete(vma, &desc.action); + error = mmap_action_complete(vma, &desc.action, + /*is_compat=*/false); if (error) return error; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index b4864aad2db0..9e0dfd3a85b0 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1330,7 +1330,7 @@ static inline int __compat_vma_mmap(struct vm_area_desc *desc, /* Update the VMA from the descriptor. */ compat_set_vma_from_desc(vma, desc); /* Complete any specified mmap actions. */ - return mmap_action_complete(vma, &desc->action); + return mmap_action_complete(vma, &desc->action, /*is_compat=*/true); } static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index a30b8bc84955..64164e25658f 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -87,7 +87,8 @@ static inline int mmap_action_prepare(struct vm_area_desc *desc) } static inline int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action) + struct mmap_action *action, + bool is_compat) { return 0; } -- cgit v1.2.3 From 77a50e9652ac3c669c6690088bce97d960f5fd17 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 22 Apr 2026 14:43:10 -0400 Subject: MAINTAINERS: update Liam's email address Switching to private email address. Update all contact information Add an entry to mailmap at the same time. Link: https://lore.kernel.org/20260422184310.2682901-1-liam@infradead.org Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 20 ++++++++++---------- include/linux/maple_tree.h | 2 +- lib/maple_tree.c | 2 +- lib/test_maple_tree.c | 4 ++-- tools/testing/radix-tree/maple.c | 2 +- 6 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/.mailmap b/.mailmap index a8fd13adf226..f1eafd91d2c2 100644 --- a/.mailmap +++ b/.mailmap @@ -496,6 +496,7 @@ Leon Romanovsky Leon Romanovsky Leon Romanovsky Leo Yan +Liam R. Howlett Liam Mark Linas Vepstas Linus Lüssing diff --git a/MAINTAINERS b/MAINTAINERS index d6f017581d54..be2e017b3a9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15399,7 +15399,7 @@ F: include/net/netns/mctp.h F: net/mctp/ MAPLE TREE -M: Liam R. Howlett +M: Liam R. Howlett R: Alice Ryhl R: Andrew Ballance L: maple-tree@lists.infradead.org @@ -16759,7 +16759,7 @@ MEMORY MANAGEMENT - CORE M: Andrew Morton M: David Hildenbrand R: Lorenzo Stoakes -R: Liam R. Howlett +R: Liam R. Howlett R: Vlastimil Babka R: Mike Rapoport R: Suren Baghdasaryan @@ -16895,7 +16895,7 @@ MEMORY MANAGEMENT - MISC M: Andrew Morton M: David Hildenbrand R: Lorenzo Stoakes -R: Liam R. Howlett +R: Liam R. Howlett R: Vlastimil Babka R: Mike Rapoport R: Suren Baghdasaryan @@ -16997,7 +16997,7 @@ M: Andrew Morton M: David Hildenbrand M: Lorenzo Stoakes R: Rik van Riel -R: Liam R. Howlett +R: Liam R. Howlett R: Vlastimil Babka R: Harry Yoo R: Jann Horn @@ -17044,7 +17044,7 @@ M: David Hildenbrand M: Lorenzo Stoakes R: Zi Yan R: Baolin Wang -R: Liam R. Howlett +R: Liam R. Howlett R: Nico Pache R: Ryan Roberts R: Dev Jain @@ -17082,7 +17082,7 @@ F: tools/testing/selftests/mm/uffd-*.[ch] MEMORY MANAGEMENT - RUST M: Alice Ryhl R: Lorenzo Stoakes -R: Liam R. Howlett +R: Liam R. Howlett L: linux-mm@kvack.org L: rust-for-linux@vger.kernel.org S: Maintained @@ -17096,7 +17096,7 @@ F: rust/kernel/page.rs MEMORY MAPPING M: Andrew Morton -M: Liam R. Howlett +M: Liam R. Howlett M: Lorenzo Stoakes R: Vlastimil Babka R: Jann Horn @@ -17128,7 +17128,7 @@ F: tools/testing/vma/ MEMORY MAPPING - LOCKING M: Andrew Morton M: Suren Baghdasaryan -M: Liam R. Howlett +M: Liam R. Howlett M: Lorenzo Stoakes R: Vlastimil Babka R: Shakeel Butt @@ -17143,7 +17143,7 @@ F: mm/mmap_lock.c MEMORY MAPPING - MADVISE (MEMORY ADVICE) M: Andrew Morton -M: Liam R. Howlett +M: Liam R. Howlett M: Lorenzo Stoakes M: David Hildenbrand R: Vlastimil Babka @@ -23370,7 +23370,7 @@ RUST [ALLOC] M: Danilo Krummrich R: Lorenzo Stoakes R: Vlastimil Babka -R: Liam R. Howlett +R: Liam R. Howlett R: Uladzislau Rezki L: rust-for-linux@vger.kernel.org S: Maintained diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0c464eade1d6..4a5631906aff 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -4,7 +4,7 @@ /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle - * Authors: Liam R. Howlett + * Authors: Liam R. Howlett * Matthew Wilcox */ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d18d7ed9ab67..60ae5e6fc1ee 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2,7 +2,7 @@ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation - * Authors: Liam R. Howlett + * Authors: Liam R. Howlett * Matthew Wilcox * Copyright (c) 2023 ByteDance * Author: Peng Zhang diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 434d8a2fdd99..b9367c61e8b5 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2,7 +2,7 @@ /* * test_maple_tree.c: Test the maple tree API * Copyright (c) 2018-2022 Oracle Corporation - * Author: Liam R. Howlett + * Author: Liam R. Howlett * * Any tests that only require the interface of the tree. */ @@ -4021,6 +4021,6 @@ static void __exit maple_tree_harvest(void) module_init(maple_tree_seed); module_exit(maple_tree_harvest); -MODULE_AUTHOR("Liam R. Howlett "); +MODULE_AUTHOR("Liam R. Howlett "); MODULE_DESCRIPTION("maple tree API test module"); MODULE_LICENSE("GPL"); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index feedd5ab7058..0607913a3022 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -2,7 +2,7 @@ /* * maple_tree.c: Userspace testing for maple tree test-suite * Copyright (c) 2018-2022 Oracle Corporation - * Author: Liam R. Howlett + * Author: Liam R. Howlett * * Any tests that require internal knowledge of the tree or threads and other * difficult to handle in kernel tests. -- cgit v1.2.3 From 5e25407b68f460142539536e31fa20338db6146f Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Apr 2026 19:41:03 +0200 Subject: mtd: spinand: Add support for packed read data ODTR commands Some devices stuff address bits in the double byte opcode (in place of the repeated byte) in order to be able to increase the size of the devices, without adding extra address bytes. Create a flag to identify those devices. When the flag is set, use the "packed" variant for the read data operation. Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/core.c | 24 +++++++++++++++++++++--- include/linux/mtd/spinand.h | 7 +++++++ 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c index 8aa3753aaaa1..0b076790bd9d 100644 --- a/drivers/mtd/nand/spi/core.c +++ b/drivers/mtd/nand/spi/core.c @@ -100,6 +100,17 @@ spinand_fill_page_read_op(struct spinand_device *spinand, u64 addr) return op; } +static struct spi_mem_op +spinand_fill_page_read_packed_op(struct spinand_device *spinand, u64 addr) +{ + struct spi_mem_op op = spinand->op_templates->page_read; + + op.cmd.opcode |= addr >> 16; + op.addr.val = addr & 0xFFFF; + + return op; +} + struct spi_mem_op spinand_fill_prog_exec_op(struct spinand_device *spinand, u64 addr) { @@ -453,7 +464,10 @@ static int spinand_load_page_op(struct spinand_device *spinand, { struct nand_device *nand = spinand_to_nand(spinand); unsigned int row = nanddev_pos_to_row(nand, &req->pos); - struct spi_mem_op op = SPINAND_OP(spinand, page_read, row); + bool packed = spinand->flags & SPINAND_ODTR_PACKED_PAGE_READ; + struct spi_mem_op op = packed ? + SPINAND_OP(spinand, page_read_packed, row) : + SPINAND_OP(spinand, page_read, row); return spi_mem_exec_op(spinand->spimem, &op); } @@ -1489,9 +1503,13 @@ static int spinand_init_odtr_instruction_set(struct spinand_device *spinand) if (!spi_mem_supports_op(spinand->spimem, &tmpl->blk_erase)) return -EOPNOTSUPP; - tmpl->page_read = (struct spi_mem_op)SPINAND_PAGE_READ_8D_8D_0_OP(0); - if (!spi_mem_supports_op(spinand->spimem, &tmpl->page_read)) + if (spinand->flags & SPINAND_ODTR_PACKED_PAGE_READ) + tmpl->page_read = (struct spi_mem_op)SPINAND_PAGE_READ_PACKED_8D_8D_0_OP(0); + else + tmpl->page_read = (struct spi_mem_op)SPINAND_PAGE_READ_8D_8D_0_OP(0); + if (!spi_mem_supports_op(spinand->spimem, &tmpl->page_read)) { return -EOPNOTSUPP; + } tmpl->prog_exec = (struct spi_mem_op)SPINAND_PROG_EXEC_8D_8D_0_OP(0); if (!spi_mem_supports_op(spinand->spimem, &tmpl->prog_exec)) diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 58abd306ebe3..782984ba3a20 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -290,6 +290,12 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) +#define SPINAND_PAGE_READ_PACKED_8D_8D_0_OP(addr) \ + SPI_MEM_OP(SPI_MEM_DTR_OP_PACKED_CMD(0x13, addr >> 16, 8), \ + SPI_MEM_DTR_OP_ADDR(2, addr & 0xffff, 8), \ + SPI_MEM_OP_NO_DUMMY, \ + SPI_MEM_OP_NO_DATA) + #define SPINAND_PAGE_READ_FROM_CACHE_8D_8D_8D_OP(addr, ndummy, buf, len, freq) \ SPI_MEM_OP(SPI_MEM_DTR_OP_RPT_CMD(0x9d, 8), \ SPI_MEM_DTR_OP_ADDR(2, addr, 8), \ @@ -483,6 +489,7 @@ struct spinand_ecc_info { #define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) #define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) #define SPINAND_NO_RAW_ACCESS BIT(4) +#define SPINAND_ODTR_PACKED_PAGE_READ BIT(5) /** * struct spinand_ondie_ecc_conf - private SPI-NAND on-die ECC engine structure -- cgit v1.2.3 From 0898a817621a2f0cddca8122d9b974003fe5036d Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 27 Apr 2026 22:01:39 +0100 Subject: cdrom, scsi: sr: propagate read-only status to block layer via set_disk_ro() The cdrom core never calls set_disk_ro() for a registered device, so BLKROGET on a CD-ROM device always returns 0 (writable), even when the drive has no write capabilities and writes will inevitably fail. This causes problems for userspace that relies on BLKROGET to determine whether a block device is read-only. For example, systemd's loop device setup uses BLKROGET to decide whether to create a loop device with LO_FLAGS_READ_ONLY. Without the read-only flag, writes pass through the loop device to the CD-ROM and fail with I/O errors. systemd-fsck similarly checks BLKROGET to decide whether to run fsck in no-repair mode (-n). The write-capability bits in cdi->mask come from two different sources: CDC_DVD_RAM and CDC_CD_RW are populated by the driver from the MODE SENSE capabilities page (page 0x2A) before register_cdrom() is called, while CDC_MRW_W and CDC_RAM require the MMC GET CONFIGURATION command and were only probed by cdrom_open_write() at device open time. This meant that any attempt to compute the writable state from the full mask at probe time was incorrect, because the GET CONFIGURATION bits were still unset (and cdi->mask is initialized such that capabilities are assumed present). Fix this by factoring the GET CONFIGURATION probing out of cdrom_open_write() into a new exported helper, cdrom_probe_write_features(), and having sr call it from sr_probe() right after get_capabilities() has populated the MODE SENSE bits. register_cdrom() then calls set_disk_ro() based on the full write-capability mask (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW) so the block layer reflects the drive's actual write support. The feature queries used (CDF_MRW and CDF_RWRT via GET CONFIGURATION with RT=00) report drive-level capabilities that are persistent across media, so a single probe before register_cdrom() is sufficient and the redundant probe at open time is dropped. With set_disk_ro() now accurate, the long-vestigial cd->writeable flag in sr can go: get_capabilities() used to set cd->writeable based on the same four mask bits, but because CDC_MRW_W and CDC_RAM default to "capability present" in cdi->mask and aren't touched by MODE SENSE, the condition that gated cd->writeable was always true, making it unconditionally 1. Replace the corresponding gate in sr_init_command() with get_disk_ro(cd->disk), which turns a previously no-op check into a real one and also catches kernel-internal bio writers that bypass blkdev_write_iter()'s bdev_read_only() check. The sd driver (SCSI disks) does not have this problem because it checks the MODE SENSE Write Protect bit and calls set_disk_ro() accordingly. The sr driver cannot use the same approach because the MMC specification does not define the WP bit in the MODE SENSE device-specific parameter byte for CD-ROM devices. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Daan De Meyer Reviewed-by: Phillip Potter Reviewed-by: Martin K. Petersen Signed-off-by: Phillip Potter Link: https://patch.msgid.link/20260427210139.1400-2-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 73 +++++++++++++++++++++++++++++++++------------------ drivers/scsi/sr.c | 11 ++------ drivers/scsi/sr.h | 1 - include/linux/cdrom.h | 1 + 4 files changed, 51 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index fc049612d6dc..62934cf4b10d 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -631,6 +631,16 @@ int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi) WARN_ON(!cdo->generic_packet); + /* + * Propagate the drive's write support to the block layer so BLKROGET + * reflects actual write capability. Drivers that use GET CONFIGURATION + * features (CDC_MRW_W, CDC_RAM) must have called + * cdrom_probe_write_features() before register_cdrom() so the mask is + * complete here. + */ + set_disk_ro(disk, !CDROM_CAN(CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | + CDC_CD_RW)); + cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name); mutex_lock(&cdrom_mutex); list_add(&cdi->list, &cdrom_list); @@ -742,6 +752,44 @@ static int cdrom_is_random_writable(struct cdrom_device_info *cdi, int *write) return 0; } +/* + * Probe write-related MMC features via GET CONFIGURATION and update + * cdi->mask accordingly. Drivers that populate cdi->mask from the MODE SENSE + * capabilities page (e.g. sr) should call this after those MODE SENSE bits + * have been set but before register_cdrom(), so that the full set of + * write-capability bits is known by the time register_cdrom() decides on the + * initial read-only state of the disk. + */ +void cdrom_probe_write_features(struct cdrom_device_info *cdi) +{ + int mrw, mrw_write, ram_write; + + mrw = 0; + if (!cdrom_is_mrw(cdi, &mrw_write)) + mrw = 1; + + if (CDROM_CAN(CDC_MO_DRIVE)) + ram_write = 1; + else + (void) cdrom_is_random_writable(cdi, &ram_write); + + if (mrw) + cdi->mask &= ~CDC_MRW; + else + cdi->mask |= CDC_MRW; + + if (mrw_write) + cdi->mask &= ~CDC_MRW_W; + else + cdi->mask |= CDC_MRW_W; + + if (ram_write) + cdi->mask &= ~CDC_RAM; + else + cdi->mask |= CDC_RAM; +} +EXPORT_SYMBOL(cdrom_probe_write_features); + static int cdrom_media_erasable(struct cdrom_device_info *cdi) { disc_information di; @@ -894,33 +942,8 @@ static int cdrom_is_dvd_rw(struct cdrom_device_info *cdi) */ static int cdrom_open_write(struct cdrom_device_info *cdi) { - int mrw, mrw_write, ram_write; int ret = 1; - mrw = 0; - if (!cdrom_is_mrw(cdi, &mrw_write)) - mrw = 1; - - if (CDROM_CAN(CDC_MO_DRIVE)) - ram_write = 1; - else - (void) cdrom_is_random_writable(cdi, &ram_write); - - if (mrw) - cdi->mask &= ~CDC_MRW; - else - cdi->mask |= CDC_MRW; - - if (mrw_write) - cdi->mask &= ~CDC_MRW_W; - else - cdi->mask |= CDC_MRW_W; - - if (ram_write) - cdi->mask &= ~CDC_RAM; - else - cdi->mask |= CDC_RAM; - if (CDROM_CAN(CDC_MRW_W)) ret = cdrom_mrw_open_write(cdi); else if (CDROM_CAN(CDC_DVD_RAM)) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 7adb2573f50d..c36c54ecd354 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -395,7 +395,7 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) switch (req_op(rq)) { case REQ_OP_WRITE: - if (!cd->writeable) + if (get_disk_ro(cd->disk)) goto out; SCpnt->cmnd[0] = WRITE_10; cd->cdi.media_written = 1; @@ -681,6 +681,7 @@ static int sr_probe(struct scsi_device *sdev) error = -ENOMEM; if (get_capabilities(cd)) goto fail_minor; + cdrom_probe_write_features(&cd->cdi); sr_vendor_init(cd); set_capacity(disk, cd->capacity); @@ -899,14 +900,6 @@ static int get_capabilities(struct scsi_cd *cd) /*else I don't think it can close its tray cd->cdi.mask |= CDC_CLOSE_TRAY; */ - /* - * if DVD-RAM, MRW-W or CD-RW, we are randomly writable - */ - if ((cd->cdi.mask & (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW)) != - (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW)) { - cd->writeable = 1; - } - kfree(buffer); return 0; } diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h index dc899277b3a4..2d92f9cb6fec 100644 --- a/drivers/scsi/sr.h +++ b/drivers/scsi/sr.h @@ -35,7 +35,6 @@ typedef struct scsi_cd { struct scsi_device *device; unsigned int vendor; /* vendor code, see sr_vendor.c */ unsigned long ms_offset; /* for reading multisession-CD's */ - unsigned writeable : 1; unsigned use:1; /* is this device still supportable */ unsigned xa_flag:1; /* CD has XA sectors ? */ unsigned readcd_known:1; /* drive supports READ_CD (0xbe) */ diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index b907e6c2307d..260d7968cf72 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -108,6 +108,7 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, unsigned int clearing); +extern void cdrom_probe_write_features(struct cdrom_device_info *cdi); extern int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi); extern void unregister_cdrom(struct cdrom_device_info *cdi); -- cgit v1.2.3 From 0de4cb473aed57ee4ba7e0551ad27bddc19fc519 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 28 Apr 2026 08:10:43 -0700 Subject: workqueue: fix devm_alloc_workqueue() va_list misuse devm_alloc_workqueue() built a va_list and passed it as a single positional argument to the variadic alloc_workqueue() macro: va_start(args, max_active); wq = alloc_workqueue(fmt, flags, max_active, args); va_end(args); C does not allow forwarding a va_list through a ... parameter. alloc_workqueue() expands to alloc_workqueue_noprof(), which runs its own va_start() over its ... params, so the inner vsnprintf(wq->name, sizeof(wq->name), fmt, args) in __alloc_workqueue() received the outer va_list object as the first variadic slot rather than the caller's actual format arguments. Add a new static helper alloc_workqueue_va() that wraps __alloc_workqueue() and runs wq_init_lockdep() on success, and fold both alloc_workqueue_noprof() and devm_alloc_workqueue_noprof() onto it as suggested by Tejun. The wq_init_lockdep() step is required on the devm path too, otherwise __flush_workqueue()'s on-stack COMPLETION_INITIALIZER_ONSTACK_MAP would NULL-deref wq->lockdep_map. No caller changes are required. devm_alloc_ordered_workqueue() is a macro forwarding to devm_alloc_workqueue() and inherits the fix. Two in-tree callers actively trigger the broken path on every probe: drivers/power/supply/mt6370-charger.c:889 drivers/power/supply/max77705_charger.c:649 both of which use devm_alloc_ordered_workqueue(dev, "%s", 0, dev_name(dev)). A standalone reproducer module is available at[1]. Link: https://github.com/leitao/debug/blob/main/workqueue/valist/wq_va_test.c [1] Fixes: 1dfc9d60a69e ("workqueue: devres: Add device-managed allocate workqueue") Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 ++++-- kernel/workqueue.c | 28 +++++++++++++++++++--------- 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ab6cb70ca1a5..6177624539b3 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -534,8 +534,10 @@ alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...) * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(2, 5) struct workqueue_struct * -devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, - int max_active, ...); +devm_alloc_workqueue_noprof(struct device *dev, const char *fmt, + unsigned int flags, int max_active, ...); +#define devm_alloc_workqueue(...) \ + alloc_hooks(devm_alloc_workqueue_noprof(__VA_ARGS__)) #ifdef CONFIG_LOCKDEP /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f747f241a5f..24d0265191d4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5906,6 +5906,20 @@ err_destroy: return NULL; } +static struct workqueue_struct *alloc_workqueue_va(const char *fmt, + unsigned int flags, + int max_active, + va_list args) +{ + struct workqueue_struct *wq; + + wq = __alloc_workqueue(fmt, flags, max_active, args); + if (wq) + wq_init_lockdep(wq); + + return wq; +} + __printf(1, 4) struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, unsigned int flags, @@ -5915,12 +5929,8 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, va_list args; va_start(args, max_active); - wq = __alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); - if (!wq) - return NULL; - - wq_init_lockdep(wq); return wq; } @@ -5932,15 +5942,15 @@ static void devm_workqueue_release(void *res) } __printf(2, 5) struct workqueue_struct * -devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, - int max_active, ...) +devm_alloc_workqueue_noprof(struct device *dev, const char *fmt, + unsigned int flags, int max_active, ...) { struct workqueue_struct *wq; va_list args; int ret; va_start(args, max_active); - wq = alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; @@ -5951,7 +5961,7 @@ devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, return wq; } -EXPORT_SYMBOL_GPL(devm_alloc_workqueue); +EXPORT_SYMBOL_GPL(devm_alloc_workqueue_noprof); #ifdef CONFIG_LOCKDEP __printf(1, 5) -- cgit v1.2.3 From 6813985ca456d1f5677ad9554f55805cbf27e16f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Apr 2026 17:35:18 +0200 Subject: netfilter: x_tables: add .check_hooks to matches and targets Add a new .check_hooks interface for checking if the match/target is used from the validate hook according to its configuration. Move existing conditional hook check based on the match/target configuration from .checkentry to .check_hooks for the following matches/targets: - addrtype - devgroup - physdev - policy - set - TCPMSS - SET This is a preparation patch to fix nft_compat, not functional changes are intended. Based on patch from Florian Westphal. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 8 ++++ net/netfilter/x_tables.c | 79 ++++++++++++++++++++++++++++++++++---- net/netfilter/xt_TCPMSS.c | 33 ++++++++-------- net/netfilter/xt_addrtype.c | 25 +++++++++--- net/netfilter/xt_devgroup.c | 18 ++++++--- net/netfilter/xt_physdev.c | 20 +++++++--- net/netfilter/xt_policy.c | 24 +++++++++--- net/netfilter/xt_set.c | 39 ++++++++++++------- 8 files changed, 187 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 77c778d84d4c..a81b46af5118 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -146,6 +146,9 @@ struct xt_match { /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); + /* Called to validate hooks based on the match configuration. */ + int (*check_hooks)(const struct xt_mtchk_param *); + /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -187,6 +190,9 @@ struct xt_target { /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); + /* Called to validate hooks based on the target configuration. */ + int (*check_hooks)(const struct xt_tgchk_param *); + /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -279,8 +285,10 @@ bool xt_find_jump_offset(const unsigned int *offsets, int xt_check_proc_name(const char *name, unsigned int size); +int xt_check_hooks_match(struct xt_mtchk_param *par); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); +int xt_check_hooks_target(struct xt_tgchk_param *par); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 9f837fb5ceb4..2c67c2e6b132 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -477,11 +477,9 @@ int xt_check_proc_name(const char *name, unsigned int size) } EXPORT_SYMBOL(xt_check_proc_name); -int xt_check_match(struct xt_mtchk_param *par, - unsigned int size, u16 proto, bool inv_proto) +static int xt_check_match_common(struct xt_mtchk_param *par, + unsigned int size, u16 proto, bool inv_proto) { - int ret; - if (XT_ALIGN(par->match->matchsize) != size && par->match->matchsize != -1) { /* @@ -530,6 +528,14 @@ int xt_check_match(struct xt_mtchk_param *par, par->match->proto); return -EINVAL; } + + return 0; +} + +static int xt_checkentry_match(struct xt_mtchk_param *par) +{ + int ret; + if (par->match->checkentry != NULL) { ret = par->match->checkentry(par); if (ret < 0) @@ -538,8 +544,34 @@ int xt_check_match(struct xt_mtchk_param *par, /* Flag up potential errors. */ return -EIO; } + + return 0; +} + +int xt_check_hooks_match(struct xt_mtchk_param *par) +{ + if (par->match->check_hooks != NULL) + return par->match->check_hooks(par); + return 0; } +EXPORT_SYMBOL_GPL(xt_check_hooks_match); + +int xt_check_match(struct xt_mtchk_param *par, + unsigned int size, u16 proto, bool inv_proto) +{ + int ret; + + ret = xt_check_match_common(par, size, proto, inv_proto); + if (ret < 0) + return ret; + + ret = xt_check_hooks_match(par); + if (ret < 0) + return ret; + + return xt_checkentry_match(par); +} EXPORT_SYMBOL_GPL(xt_check_match); /** xt_check_entry_match - check that matches end before start of target @@ -1012,11 +1044,9 @@ bool xt_find_jump_offset(const unsigned int *offsets, } EXPORT_SYMBOL(xt_find_jump_offset); -int xt_check_target(struct xt_tgchk_param *par, - unsigned int size, u16 proto, bool inv_proto) +static int xt_check_target_common(struct xt_tgchk_param *par, + unsigned int size, u16 proto, bool inv_proto) { - int ret; - if (XT_ALIGN(par->target->targetsize) != size) { pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n", xt_prefix[par->family], par->target->name, @@ -1061,6 +1091,23 @@ int xt_check_target(struct xt_tgchk_param *par, par->target->proto); return -EINVAL; } + + return 0; +} + +int xt_check_hooks_target(struct xt_tgchk_param *par) +{ + if (par->target->check_hooks != NULL) + return par->target->check_hooks(par); + + return 0; +} +EXPORT_SYMBOL_GPL(xt_check_hooks_target); + +static int xt_checkentry_target(struct xt_tgchk_param *par) +{ + int ret; + if (par->target->checkentry != NULL) { ret = par->target->checkentry(par); if (ret < 0) @@ -1071,6 +1118,22 @@ int xt_check_target(struct xt_tgchk_param *par, } return 0; } + +int xt_check_target(struct xt_tgchk_param *par, + unsigned int size, u16 proto, bool inv_proto) +{ + int ret; + + ret = xt_check_target_common(par, size, proto, inv_proto); + if (ret < 0) + return ret; + + ret = xt_check_hooks_target(par); + if (ret < 0) + return ret; + + return xt_checkentry_target(par); +} EXPORT_SYMBOL_GPL(xt_check_target); /** diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 116a885adb3c..80e1634bc51f 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -247,6 +247,21 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) } #endif +static int tcpmss_tg4_check_hooks(const struct xt_tgchk_param *par) +{ + const struct xt_tcpmss_info *info = par->targinfo; + + if (info->mss == XT_TCPMSS_CLAMP_PMTU && + (par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { + pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); + return -EINVAL; + } + + return 0; +} + /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { @@ -262,17 +277,9 @@ static inline bool find_syn_match(const struct xt_entry_match *m) static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = par->targinfo; const struct ipt_entry *e = par->entryinfo; const struct xt_entry_match *ematch; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (par->hook_mask & ~((1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING))) != 0) { - pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); - return -EINVAL; - } if (par->nft_compat) return 0; @@ -286,17 +293,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = par->targinfo; const struct ip6t_entry *e = par->entryinfo; const struct xt_entry_match *ematch; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (par->hook_mask & ~((1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING))) != 0) { - pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); - return -EINVAL; - } if (par->nft_compat) return 0; @@ -312,6 +311,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV4, .name = "TCPMSS", + .check_hooks = tcpmss_tg4_check_hooks, .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), @@ -322,6 +322,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV6, .name = "TCPMSS", + .check_hooks = tcpmss_tg4_check_hooks, .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index a77088943107..913dbe3aa5e2 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -153,14 +153,10 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) return ret; } -static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +static int addrtype_mt_check_hooks(const struct xt_mtchk_param *par) { - const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; struct xt_addrtype_info_v1 *info = par->matchinfo; - - if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && - info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) - goto err; + const char *errmsg; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && @@ -176,6 +172,21 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) goto err; } + return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; +} + +static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +{ + const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; + struct xt_addrtype_info_v1 *info = par->matchinfo; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) + goto err; + #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) if (par->family == NFPROTO_IPV6) { if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { @@ -211,6 +222,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, + .check_hooks = addrtype_mt_check_hooks, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE @@ -221,6 +233,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 1, .match = addrtype_mt_v1, + .check_hooks = addrtype_mt_check_hooks, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c index 9520dd00070b..6d1a44ab5eee 100644 --- a/net/netfilter/xt_devgroup.c +++ b/net/netfilter/xt_devgroup.c @@ -33,14 +33,10 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) return true; } -static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +static int devgroup_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_devgroup_info *info = par->matchinfo; - if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | - XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) - return -EINVAL; - if (info->flags & XT_DEVGROUP_MATCH_SRC && par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | @@ -56,9 +52,21 @@ static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) return 0; } +static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | + XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) + return -EINVAL; + + return 0; +} + static struct xt_match devgroup_mt_reg __read_mostly = { .name = "devgroup", .match = devgroup_mt, + .check_hooks = devgroup_mt_check_hooks, .checkentry = devgroup_mt_checkentry, .matchsize = sizeof(struct xt_devgroup_info), .family = NFPROTO_UNSPEC, diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index d2b0b52434fa..dd98f758176c 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -91,14 +91,10 @@ match_outdev: return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); } -static int physdev_mt_check(const struct xt_mtchk_param *par) +static int physdev_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_physdev_info *info = par->matchinfo; - static bool brnf_probed __read_mostly; - if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || - info->bitmask & ~XT_PHYSDEV_OP_MASK) - return -EINVAL; if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && @@ -107,6 +103,18 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) return -EINVAL; } + return 0; +} + +static int physdev_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_physdev_info *info = par->matchinfo; + static bool brnf_probed __read_mostly; + + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || + info->bitmask & ~XT_PHYSDEV_OP_MASK) + return -EINVAL; + #define X(memb) strnlen(info->memb, sizeof(info->memb)) >= sizeof(info->memb) if (info->bitmask & XT_PHYSDEV_OP_IN) { if (info->physindev[0] == '\0') @@ -141,6 +149,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", .family = NFPROTO_IPV4, + .check_hooks = physdev_mt_check_hooks, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), @@ -149,6 +158,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", .family = NFPROTO_IPV6, + .check_hooks = physdev_mt_check_hooks, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index b5fa65558318..ff54e3a8581e 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -126,13 +126,10 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par) return ret; } -static int policy_mt_check(const struct xt_mtchk_param *par) +static int policy_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_policy_info *info = par->matchinfo; - const char *errmsg = "neither incoming nor outgoing policy selected"; - - if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) - goto err; + const char *errmsg; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { @@ -144,6 +141,21 @@ static int policy_mt_check(const struct xt_mtchk_param *par) errmsg = "input policy not valid in POSTROUTING and OUTPUT"; goto err; } + + return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; +} + +static int policy_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_policy_info *info = par->matchinfo; + const char *errmsg = "neither incoming nor outgoing policy selected"; + + if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) + goto err; + if (info->len > XT_POLICY_MAX_ELEM) { errmsg = "too many policy elements"; goto err; @@ -158,6 +170,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV4, + .check_hooks = policy_mt_check_hooks, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), @@ -166,6 +179,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV6, + .check_hooks = policy_mt_check_hooks, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 731bc2cafae4..4ae04bba9358 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -430,6 +430,29 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } +static int +set_target_v3_check_hooks(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + + if (info->map_set.index != IPSET_INVALID_ID) { + if (strncmp(par->table, "mangle", 7)) { + pr_info_ratelimited("--map-set only usable from mangle table\n"); + return -EINVAL; + } + if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | + (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && + (par->hook_mask & ~(1 << NF_INET_FORWARD | + 1 << NF_INET_LOCAL_OUT | + 1 << NF_INET_POST_ROUTING))) { + pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); + return -EINVAL; + } + } + + return 0; +} + static int set_target_v3_checkentry(const struct xt_tgchk_param *par) { @@ -459,20 +482,6 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) } if (info->map_set.index != IPSET_INVALID_ID) { - if (strncmp(par->table, "mangle", 7)) { - pr_info_ratelimited("--map-set only usable from mangle table\n"); - ret = -EINVAL; - goto cleanup_del; - } - if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | - (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && - (par->hook_mask & ~(1 << NF_INET_FORWARD | - 1 << NF_INET_LOCAL_OUT | - 1 << NF_INET_POST_ROUTING))) { - pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); - ret = -EINVAL; - goto cleanup_del; - } index = ip_set_nfnl_get_byindex(par->net, info->map_set.index); if (index == IPSET_INVALID_ID) { @@ -672,6 +681,7 @@ static struct xt_target set_targets[] __read_mostly = { .family = NFPROTO_IPV4, .target = set_target_v3, .targetsize = sizeof(struct xt_set_info_target_v3), + .check_hooks = set_target_v3_check_hooks, .checkentry = set_target_v3_checkentry, .destroy = set_target_v3_destroy, .me = THIS_MODULE @@ -682,6 +692,7 @@ static struct xt_target set_targets[] __read_mostly = { .family = NFPROTO_IPV6, .target = set_target_v3, .targetsize = sizeof(struct xt_set_info_target_v3), + .check_hooks = set_target_v3_check_hooks, .checkentry = set_target_v3_checkentry, .destroy = set_target_v3_destroy, .me = THIS_MODULE -- cgit v1.2.3 From e768103cfbac30a49860aca08a7710d39dbdd470 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 29 Apr 2026 15:43:36 +0200 Subject: smb: smbdirect: introduce and use include/linux/smbdirect.h This makes it easier to rebuild cifs.ko and ksmbd.ko against a running kernel. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/linux-cifs/aehrPuY60VMcYGU8@infradead.org/ Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: Christoph Hellwig Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- MAINTAINERS | 1 + fs/smb/client/smbdirect.c | 1 - fs/smb/client/smbdirect.h | 2 +- fs/smb/server/transport_rdma.c | 1 - fs/smb/server/transport_rdma.h | 2 +- fs/smb/smbdirect/internal.h | 3 +- fs/smb/smbdirect/public.h | 146 -------------------------------- fs/smb/smbdirect/smbdirect.h | 52 ------------ include/linux/smbdirect.h | 186 +++++++++++++++++++++++++++++++++++++++++ 9 files changed, 190 insertions(+), 204 deletions(-) delete mode 100644 fs/smb/smbdirect/public.h delete mode 100644 fs/smb/smbdirect/smbdirect.h create mode 100644 include/linux/smbdirect.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 2c67ee25ffe6..060dca38dbf7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24650,6 +24650,7 @@ S: Maintained F: fs/smb/client/smbdirect.* F: fs/smb/smbdirect/ F: fs/smb/server/transport_rdma.* +F: include/linux/smbdirect.h SMC91x ETHERNET DRIVER M: Nicolas Pitre diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index b9826185de18..563ef488a225 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -9,7 +9,6 @@ #include "cifs_debug.h" #include "cifsproto.h" #include "smb2proto.h" -#include "../smbdirect/public.h" /* Port numbers for SMBD transport */ #define SMB_PORT 445 diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 287ac849213d..be205ec02077 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -12,7 +12,7 @@ #include "cifsglob.h" -#include "../smbdirect/smbdirect.h" +#include extern int rdma_readwrite_threshold; extern int smbd_max_frmr_depth; diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 346c051e31f5..b6d63ff8a8a3 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -18,7 +18,6 @@ #include "smb_common.h" #include "../common/smb2status.h" #include "transport_rdma.h" -#include "../smbdirect/public.h" #define SMB_DIRECT_PORT_IWARP 5445 diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h index bde3d88aecc7..8b78917a1795 100644 --- a/fs/smb/server/transport_rdma.h +++ b/fs/smb/server/transport_rdma.h @@ -25,6 +25,6 @@ static inline void init_smbd_max_io_size(unsigned int sz) { } static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; } #endif -#include "../smbdirect/smbdirect.h" +#include #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ diff --git a/fs/smb/smbdirect/internal.h b/fs/smb/smbdirect/internal.h index 82529b41708b..e9959e6dc13a 100644 --- a/fs/smb/smbdirect/internal.h +++ b/fs/smb/smbdirect/internal.h @@ -9,9 +9,8 @@ #define DEFAULT_SYMBOL_NAMESPACE "SMBDIRECT" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include "smbdirect.h" +#include #include "pdu.h" -#include "public.h" #include diff --git a/fs/smb/smbdirect/public.h b/fs/smb/smbdirect/public.h deleted file mode 100644 index d4fb36e65254..000000000000 --- a/fs/smb/smbdirect/public.h +++ /dev/null @@ -1,146 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2025, Stefan Metzmacher - */ - -#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ -#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ - -struct smbdirect_buffer_descriptor_v1; -struct smbdirect_socket_parameters; - -struct smbdirect_socket; -struct smbdirect_send_batch; -struct smbdirect_mr_io; - -#include - -u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev); - -bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs); - -int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc); - -int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc); - -int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc, - const struct smbdirect_socket_parameters *sp); - -const struct smbdirect_socket_parameters * -smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc); - -int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc, - enum ib_poll_context poll_ctx, - gfp_t gfp_mask); - -#define SMBDIRECT_LOG_ERR 0x0 -#define SMBDIRECT_LOG_INFO 0x1 - -#define SMBDIRECT_LOG_OUTGOING 0x1 -#define SMBDIRECT_LOG_INCOMING 0x2 -#define SMBDIRECT_LOG_READ 0x4 -#define SMBDIRECT_LOG_WRITE 0x8 -#define SMBDIRECT_LOG_RDMA_SEND 0x10 -#define SMBDIRECT_LOG_RDMA_RECV 0x20 -#define SMBDIRECT_LOG_KEEP_ALIVE 0x40 -#define SMBDIRECT_LOG_RDMA_EVENT 0x80 -#define SMBDIRECT_LOG_RDMA_MR 0x100 -#define SMBDIRECT_LOG_RDMA_RW 0x200 -#define SMBDIRECT_LOG_NEGOTIATE 0x400 -void smbdirect_socket_set_logging(struct smbdirect_socket *sc, - void *private_ptr, - bool (*needed)(struct smbdirect_socket *sc, - void *private_ptr, - unsigned int lvl, - unsigned int cls), - void (*vaprintf)(struct smbdirect_socket *sc, - const char *func, - unsigned int line, - void *private_ptr, - unsigned int lvl, - unsigned int cls, - struct va_format *vaf)); - -bool smbdirect_connection_is_connected(struct smbdirect_socket *sc); - -int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc); - -int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr); - -void smbdirect_socket_shutdown(struct smbdirect_socket *sc); - -void smbdirect_socket_release(struct smbdirect_socket *sc); - -int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch, - bool is_last); - -/* - * This is only temporary and only needed - * as long as the client still requires - * to use smbdirect_connection_send_single_iter() - */ -struct smbdirect_send_batch_storage { - union { - struct list_head __msg_list; - __aligned_u64 __space[5]; - }; -}; - -struct smbdirect_send_batch * -smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage, - bool need_invalidate_rkey, - unsigned int remote_key); - -int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch, - struct iov_iter *iter, - unsigned int flags, - u32 remaining_data_length); - -int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc); - -int smbdirect_connection_send_iter(struct smbdirect_socket *sc, - struct iov_iter *iter, - unsigned int flags, - bool need_invalidate, - unsigned int remote_key); - -int smbdirect_connection_recvmsg(struct smbdirect_socket *sc, - struct msghdr *msg, - unsigned int flags); - -int smbdirect_connect(struct smbdirect_socket *sc, - const struct sockaddr *dst); - -int smbdirect_connect_sync(struct smbdirect_socket *sc, - const struct sockaddr *dst); - -int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog); - -struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc, - long timeo, - struct proto_accept_arg *arg); - -int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc, - void *buf, size_t buf_len, - struct smbdirect_buffer_descriptor_v1 *desc, - size_t desc_len, - bool is_read); - -struct smbdirect_mr_io * -smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, - struct iov_iter *iter, - bool writing, - bool need_invalidate); - -void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr, - struct smbdirect_buffer_descriptor_v1 *v1); - -void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr); - -void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc, - unsigned int rdma_readwrite_threshold, - struct seq_file *m); - -#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ */ diff --git a/fs/smb/smbdirect/smbdirect.h b/fs/smb/smbdirect/smbdirect.h deleted file mode 100644 index bbab5f7f7cc9..000000000000 --- a/fs/smb/smbdirect/smbdirect.h +++ /dev/null @@ -1,52 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2025 Stefan Metzmacher - */ - -#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ -#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ - -#include - -/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */ -struct smbdirect_buffer_descriptor_v1 { - __le64 offset; - __le32 token; - __le32 length; -} __packed; - -/* - * Connection parameters mostly from [MS-SMBD] 3.1.1.1 - * - * These are setup and negotiated at the beginning of a - * connection and remain constant unless explicitly changed. - * - * Some values are important for the upper layer. - */ -struct smbdirect_socket_parameters { - __u64 flags; -#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1) -#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2) - __u32 resolve_addr_timeout_msec; - __u32 resolve_route_timeout_msec; - __u32 rdma_connect_timeout_msec; - __u32 negotiate_timeout_msec; - __u16 initiator_depth; /* limited to U8_MAX */ - __u16 responder_resources; /* limited to U8_MAX */ - __u16 recv_credit_max; - __u16 send_credit_target; - __u32 max_send_size; - __u32 max_fragmented_send_size; - __u32 max_recv_size; - __u32 max_fragmented_recv_size; - __u32 max_read_write_size; - __u32 max_frmr_depth; - __u32 keepalive_interval_msec; - __u32 keepalive_timeout_msec; -} __packed; - -#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \ - SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \ - SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW) - -#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */ diff --git a/include/linux/smbdirect.h b/include/linux/smbdirect.h new file mode 100644 index 000000000000..97f5ba730fa7 --- /dev/null +++ b/include/linux/smbdirect.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2025, Stefan Metzmacher + */ + +#ifndef __LINUX_SMBDIRECT_H__ +#define __LINUX_SMBDIRECT_H__ + +#include + +/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */ +struct smbdirect_buffer_descriptor_v1 { + __le64 offset; + __le32 token; + __le32 length; +} __packed; + +/* + * Connection parameters mostly from [MS-SMBD] 3.1.1.1 + * + * These are setup and negotiated at the beginning of a + * connection and remain constant unless explicitly changed. + * + * Some values are important for the upper layer. + */ +struct smbdirect_socket_parameters { + __u64 flags; +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1) +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2) + __u32 resolve_addr_timeout_msec; + __u32 resolve_route_timeout_msec; + __u32 rdma_connect_timeout_msec; + __u32 negotiate_timeout_msec; + __u16 initiator_depth; /* limited to U8_MAX */ + __u16 responder_resources; /* limited to U8_MAX */ + __u16 recv_credit_max; + __u16 send_credit_target; + __u32 max_send_size; + __u32 max_fragmented_send_size; + __u32 max_recv_size; + __u32 max_fragmented_recv_size; + __u32 max_read_write_size; + __u32 max_frmr_depth; + __u32 keepalive_interval_msec; + __u32 keepalive_timeout_msec; +} __packed; + +#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW) + +struct smbdirect_socket; +struct smbdirect_send_batch; +struct smbdirect_mr_io; + +#include + +u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev); + +bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs); + +int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc); + +int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc); + +int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc, + const struct smbdirect_socket_parameters *sp); + +const struct smbdirect_socket_parameters * +smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc); + +int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc, + enum ib_poll_context poll_ctx, + gfp_t gfp_mask); + +#define SMBDIRECT_LOG_ERR 0x0 +#define SMBDIRECT_LOG_INFO 0x1 + +#define SMBDIRECT_LOG_OUTGOING 0x1 +#define SMBDIRECT_LOG_INCOMING 0x2 +#define SMBDIRECT_LOG_READ 0x4 +#define SMBDIRECT_LOG_WRITE 0x8 +#define SMBDIRECT_LOG_RDMA_SEND 0x10 +#define SMBDIRECT_LOG_RDMA_RECV 0x20 +#define SMBDIRECT_LOG_KEEP_ALIVE 0x40 +#define SMBDIRECT_LOG_RDMA_EVENT 0x80 +#define SMBDIRECT_LOG_RDMA_MR 0x100 +#define SMBDIRECT_LOG_RDMA_RW 0x200 +#define SMBDIRECT_LOG_NEGOTIATE 0x400 +void smbdirect_socket_set_logging(struct smbdirect_socket *sc, + void *private_ptr, + bool (*needed)(struct smbdirect_socket *sc, + void *private_ptr, + unsigned int lvl, + unsigned int cls), + void (*vaprintf)(struct smbdirect_socket *sc, + const char *func, + unsigned int line, + void *private_ptr, + unsigned int lvl, + unsigned int cls, + struct va_format *vaf)); + +bool smbdirect_connection_is_connected(struct smbdirect_socket *sc); + +int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc); + +int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr); + +void smbdirect_socket_shutdown(struct smbdirect_socket *sc); + +void smbdirect_socket_release(struct smbdirect_socket *sc); + +int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc, + struct smbdirect_send_batch *batch, + bool is_last); + +/* + * This is only temporary and only needed + * as long as the client still requires + * to use smbdirect_connection_send_single_iter() + */ +struct smbdirect_send_batch_storage { + union { + struct list_head __msg_list; + __aligned_u64 __space[5]; + }; +}; + +struct smbdirect_send_batch * +smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage, + bool need_invalidate_rkey, + unsigned int remote_key); + +int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc, + struct smbdirect_send_batch *batch, + struct iov_iter *iter, + unsigned int flags, + u32 remaining_data_length); + +int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc); + +int smbdirect_connection_send_iter(struct smbdirect_socket *sc, + struct iov_iter *iter, + unsigned int flags, + bool need_invalidate, + unsigned int remote_key); + +int smbdirect_connection_recvmsg(struct smbdirect_socket *sc, + struct msghdr *msg, + unsigned int flags); + +int smbdirect_connect(struct smbdirect_socket *sc, + const struct sockaddr *dst); + +int smbdirect_connect_sync(struct smbdirect_socket *sc, + const struct sockaddr *dst); + +int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog); + +struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc, + long timeo, + struct proto_accept_arg *arg); + +int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc, + void *buf, size_t buf_len, + struct smbdirect_buffer_descriptor_v1 *desc, + size_t desc_len, + bool is_read); + +struct smbdirect_mr_io * +smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, + struct iov_iter *iter, + bool writing, + bool need_invalidate); + +void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr, + struct smbdirect_buffer_descriptor_v1 *v1); + +void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr); + +void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc, + unsigned int rdma_readwrite_threshold, + struct seq_file *m); + +#endif /* __LINUX_SMBDIRECT_H__ */ -- cgit v1.2.3 From 93618edf753838a727dbff63c7c291dee22d656b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 May 2026 08:31:22 -1000 Subject: cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated A chain of commits going back to v7.0 reworked rmdir to satisfy the controller invariant that a subsystem's ->css_offline() must not run while tasks are still doing kernel-side work in the cgroup. [1] d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out") [2] a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") [3] 1b164b876c36 ("cgroup: Wait for dying tasks to leave on rmdir") [4] 4c56a8ac6869 ("cgroup: Fix cgroup_drain_dying() testing the wrong condition") [5] 13e786b64bd3 ("cgroup: Increment nr_dying_subsys_* from rmdir context") [1] moved task cset unlink from do_exit() to finish_task_switch() so a task's cset link drops only after the task has fully stopped scheduling. That made tasks past exit_signals() linger on cset->tasks until their final context switch, which led to a series of problems as what userspace expected to see after rmdir diverged from what the kernel needs to wait for. [2]-[5] tried to bridge that divergence: [2] filtered the exiting tasks from cgroup.procs; [3] had rmdir(2) sleep in TASK_UNINTERRUPTIBLE for them; [4] fixed the wait's condition; [5] made nr_dying_subsys_* visible synchronously. The cgroup_drain_dying() wait in [3] turned out to be a dead end. When the rmdir caller is also the reaper of a zombie that pins a pidns teardown (e.g. host PID 1 systemd reaping orphan pids that were re-parented to it during the same teardown), rmdir blocks in TASK_UNINTERRUPTIBLE waiting for those pids to free, the pids can't free because PID 1 is the reaper and it's stuck in rmdir, and the system A-A deadlocks. No internal lock ordering breaks this; the wait itself is the bug. The css killing side that drove the original reorder, however, can be made cleanly asynchronous: ->css_offline() is already async, run from css_killed_work_fn() driven by percpu_ref_kill_and_confirm(). The fix is to make that chain start only after all tasks have left the cgroup. rmdir's user-visible side then returns as soon as cgroup.procs and friends are empty, while ->css_offline() still runs only after the cgroup is fully drained. Verified by the original reproducer (pidns teardown + zombie reaper, runs under vng) which hangs vanilla and succeeds here, and by per-commit deterministic repros for [2], [3], [4], [5] with a boot parameter that widens the post-exit_signals() window so each state is reliably reachable. Some stress tests on top of that. cgroup_apply_control_disable() has the same shape of pre-existing race: when a controller is disabled via subtree_control, kill_css() ran synchronously while tasks past exit_signals() could still be linked to the cgroup's csets, and ->css_offline() could fire before they drained. This patch preserves the existing synchronous behavior at that call site (kill_css_sync() + kill_css_finish() back-to-back) and a follow-up patch will defer kill_css_finish() there using a per-css trigger. This seems like the right approach and I don't see problems with it. The changes are somewhat invasive but not excessively so, so backporting to -stable should be okay. If something does turn out to be wrong, the fallback is to revert the entire chain ([1]-[5]) and rework in the development branch instead. v2: Pin cgrp across the deferred destroy work with explicit cgroup_get()/cgroup_put() around queue_work() and the work_fn. v1 wasn't actually broken (ordered cgroup_offline_wq + queue_work order in cgroup_task_dead() saved it) but the explicit ref removes the dependency on those non-obvious invariants. Also note the pre-existing cgroup_apply_control_disable() race in the description; a follow-up will defer kill_css_finish() there. Fixes: 1b164b876c36 ("cgroup: Wait for dying tasks to leave on rmdir") Cc: stable@vger.kernel.org # v7.0+ Reported-and-tested-by: Martin Pitt Link: https://lore.kernel.org/all/afHNg2VX2jy9bW7y@piware.de/ Link: https://lore.kernel.org/all/35e0670adb4abeab13da2c321582af9f@kernel.org/ Signed-off-by: Tejun Heo Acked-by: Sebastian Andrzej Siewior --- include/linux/cgroup-defs.h | 4 +- kernel/cgroup/cgroup.c | 250 +++++++++++++++++++++----------------------- 2 files changed, 119 insertions(+), 135 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f42563739d2e..50a784da7a81 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -611,8 +611,8 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* used by cgroup_rmdir() to wait for dying tasks to leave */ - wait_queue_head_t dying_populated_waitq; + /* defers killing csses after removal until cgroup is depopulated */ + struct work_struct finish_destroy_work; /* used to schedule release agent */ struct work_struct release_agent_work; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c928dea9dea6..bd10a7e2f9c5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -264,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); +static void cgroup_finish_destroy(struct cgroup *cgrp); +static void kill_css_sync(struct cgroup_subsys_state *css); +static void kill_css_finish(struct cgroup_subsys_state *css); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); -static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -797,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (was_populated == cgroup_is_populated(cgrp)) break; + /* + * Subtree just emptied below an offlined cgrp. Fire deferred + * destroy. The transition is one-shot. + */ + if (was_populated && !css_is_online(&cgrp->self)) { + cgroup_get(cgrp); + WARN_ON_ONCE(!queue_work(cgroup_offline_wq, + &cgrp->finish_destroy_work)); + } + cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); @@ -2039,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } +static void cgroup_finish_destroy_work_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work); + + cgroup_lock(); + cgroup_finish_destroy(cgrp); + cgroup_unlock(); + cgroup_put(cgrp); +} + static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -2065,7 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); - init_waitqueue_head(&cgrp->dying_populated_waitq); + INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -3375,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { - kill_css(css); + kill_css_sync(css); + kill_css_finish(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) @@ -5514,7 +5537,7 @@ static struct cftype cgroup_psi_files[] = { * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. - * Implemented in kill_css(). + * Implemented in kill_css_finish(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be @@ -5993,7 +6016,7 @@ out_unlock: /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to - * initiate destruction and put the css ref from kill_css(). + * initiate destruction and put the css ref from kill_css_finish(). */ static void css_killed_work_fn(struct work_struct *work) { @@ -6025,15 +6048,12 @@ static void css_killed_ref_fn(struct percpu_ref *ref) } /** - * kill_css - destroy a css - * @css: css to destroy + * kill_css_sync - synchronous half of css teardown + * @css: css being killed * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget_online() is guaranteed to fail and when - * the reference count reaches zero, @css will be released. + * See cgroup_destroy_locked(). */ -static void kill_css(struct cgroup_subsys_state *css) +static void kill_css_sync(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; @@ -6056,24 +6076,6 @@ static void kill_css(struct cgroup_subsys_state *css) */ css_clear_dir(css); - /* - * Killing would put the base ref, but we need to keep it alive - * until after ->css_offline(). - */ - css_get(css); - - /* - * cgroup core guarantees that, by the time ->css_offline() is - * invoked, no new css reference will be given out via - * css_tryget_online(). We can't simply call percpu_ref_kill() and - * proceed to offlining css's because percpu_ref_kill() doesn't - * guarantee that the ref is seen as killed on all CPUs on return. - * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. - */ - percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); - css->cgroup->nr_dying_subsys[ss->id]++; /* * Parent css and cgroup cannot be freed until after the freeing @@ -6086,44 +6088,88 @@ static void kill_css(struct cgroup_subsys_state *css) } /** - * cgroup_destroy_locked - the first stage of cgroup destruction + * kill_css_finish - deferred half of css teardown + * @css: css being killed + * + * See cgroup_destroy_locked(). + */ +static void kill_css_finish(struct cgroup_subsys_state *css) +{ + lockdep_assert_held(&cgroup_mutex); + + /* + * Skip on re-entry: cgroup_apply_control_disable() may have killed @css + * earlier. cgroup_destroy_locked() can still walk it because + * offline_css() (which NULLs cgrp->subsys[ssid]) runs async. + */ + if (percpu_ref_is_dying(&css->refcnt)) + return; + + /* + * Killing would put the base ref, but we need to keep it alive until + * after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is invoked, + * no new css reference will be given out via css_tryget_online(). We + * can't simply call percpu_ref_kill() and proceed to offlining css's + * because percpu_ref_kill() doesn't guarantee that the ref is seen as + * killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each css is + * confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); +} + +/** + * cgroup_destroy_locked - destroy @cgrp (called on rmdir) * @cgrp: cgroup to be destroyed * - * css's make use of percpu refcnts whose killing latency shouldn't be - * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget_online() won't succeed by the time - * ->css_offline() is invoked. To satisfy all the requirements, - * destruction is implemented in the following two steps. - * - * s1. Verify @cgrp can be destroyed and mark it dying. Remove all - * userland visible parts and start killing the percpu refcnts of - * css's. Set up so that the next stage will be kicked off once all - * the percpu refcnts are confirmed to be killed. - * - * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the - * rest of destruction. Once all cgroup references are gone, the - * cgroup is RCU-freed. - * - * This function implements s1. After this step, @cgrp is gone as far as - * the userland is concerned and a new cgroup with the same name may be - * created. As cgroup doesn't care about the names internally, this - * doesn't cause any problem. + * Tear down @cgrp on behalf of rmdir. Constraints: + * + * - Userspace: rmdir must succeed when cgroup.procs and friends are empty. + * + * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's + * subtree is still doing kernel work. A task hidden from cgroup.procs (past + * exit_signals() with signal->live cleared) can still schedule, allocate, and + * consume resources until its final context switch. Dying descendants in the + * subtree can host such tasks too. + * + * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs. + * + * The destruction runs in three parts: + * + * - This function: synchronous user-visible state teardown plus kill_css_sync() + * on each subsystem css. + * + * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on + * each subsystem css. Fires once @cgrp's subtree is fully drained, either + * inline here or from cgroup_update_populated(). + * + * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn -> + * ->css_offline() -> release/free. + * + * Return 0 on success, -EBUSY if a userspace-visible task or an online child + * remains. */ static int cgroup_destroy_locked(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; int ssid, ret; lockdep_assert_held(&cgroup_mutex); - /* - * Only migration can raise populated from zero and we're already - * holding cgroup_mutex. - */ - if (cgroup_is_populated(cgrp)) + css_task_iter_start(&cgrp->self, 0, &it); + task = css_task_iter_next(&it); + css_task_iter_end(&it); + if (task) return -EBUSY; /* @@ -6147,9 +6193,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) link->cset->dead = true; spin_unlock_irq(&css_set_lock); - /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) - kill_css(css); + kill_css_sync(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); @@ -6180,79 +6225,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); + if (!cgroup_is_populated(cgrp)) + cgroup_finish_destroy(cgrp); + return 0; }; /** - * cgroup_drain_dying - wait for dying tasks to leave before rmdir - * @cgrp: the cgroup being removed - * - * cgroup.procs and cgroup.threads use css_task_iter which filters out - * PF_EXITING tasks so that userspace doesn't see tasks that have already been - * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the - * cgroup has non-empty css_sets - is only updated when dying tasks pass through - * cgroup_task_dead() in finish_task_switch(). This creates a window where - * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir - * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no - * tasks. - * - * This function aligns cgroup_has_tasks() with what userspace can observe. If - * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are - * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the - * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. + * cgroup_finish_destroy - deferred half of @cgrp destruction + * @cgrp: cgroup whose subtree just became empty * - * This function only concerns itself with this cgroup's own dying tasks. - * Whether the cgroup has children is cgroup_destroy_locked()'s problem. - * - * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we - * retry the full check from scratch. - * - * Must be called with cgroup_mutex held. + * See cgroup_destroy_locked() for the rationale. */ -static int cgroup_drain_dying(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +static void cgroup_finish_destroy(struct cgroup *cgrp) { - struct css_task_iter it; - struct task_struct *task; - DEFINE_WAIT(wait); + struct cgroup_subsys_state *css; + int ssid; lockdep_assert_held(&cgroup_mutex); -retry: - if (!cgroup_has_tasks(cgrp)) - return 0; - - /* Same iterator as cgroup.threads - if any task is visible, it's busy */ - css_task_iter_start(&cgrp->self, 0, &it); - task = css_task_iter_next(&it); - css_task_iter_end(&it); - - if (task) - return -EBUSY; - /* - * All remaining tasks are PF_EXITING and will pass through - * cgroup_task_dead() shortly. Wait for a kick and retry. - * - * cgroup_has_tasks() can't transition from false to true while we're - * holding cgroup_mutex, but the true to false transition happens - * under css_set_lock (via cgroup_task_dead()). We must retest and - * prepare_to_wait() under css_set_lock. Otherwise, the transition - * can happen between our first test and prepare_to_wait(), and we - * sleep with no one to wake us. - */ - spin_lock_irq(&css_set_lock); - if (!cgroup_has_tasks(cgrp)) { - spin_unlock_irq(&css_set_lock); - return 0; - } - prepare_to_wait(&cgrp->dying_populated_waitq, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - schedule(); - finish_wait(&cgrp->dying_populated_waitq, &wait); - mutex_lock(&cgroup_mutex); - goto retry; + for_each_css(css, ssid, cgrp) + kill_css_finish(css); } int cgroup_rmdir(struct kernfs_node *kn) @@ -6264,12 +6257,9 @@ int cgroup_rmdir(struct kernfs_node *kn) if (!cgrp) return 0; - ret = cgroup_drain_dying(cgrp); - if (!ret) { - ret = cgroup_destroy_locked(cgrp); - if (!ret) - TRACE_CGROUP_PATH(rmdir, cgrp); - } + ret = cgroup_destroy_locked(cgrp); + if (!ret) + TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; @@ -7029,7 +7019,6 @@ void cgroup_task_exit(struct task_struct *tsk) static void do_cgroup_task_dead(struct task_struct *tsk) { - struct cgrp_cset_link *link; struct css_set *cset; unsigned long flags; @@ -7043,11 +7032,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk) if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); - /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) - if (waitqueue_active(&link->cgrp->dying_populated_waitq)) - wake_up(&link->cgrp->dying_populated_waitq); - if (dl_task(tsk)) dec_dl_tasks_cs(tsk); -- cgit v1.2.3 From 60f21a2649308bbd84919ba6656d5ccd660953cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Apr 2026 14:16:34 -1000 Subject: cgroup, sched_ext: Include exiting tasks in cgroup iter a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") made css_task_iter_advance() skip exiting tasks so cgroup.procs stays consistent with waitpid() visibility. Unfortunately, this broke scx_task_iter. scx_task_iter walks either scx_tasks (global) or a cgroup subtree via css_task_iter() and the two modes are expected to cover the same set of tasks. After the above change the cgroup-scoped mode silently skips tasks past exit_signals() that are still on scx_tasks. scx_sub_enable_workfn()'s abort path is one of the symptoms: an exiting SCX_TASK_SUB_INIT task can race past the cgroup iter leaking __scx_init_task() state. Other iterations share the same gap. Add CSS_TASK_ITER_WITH_DEAD to opt out of the skip and use it from scx_task_iter(). Fixes: b0e4c2f8a0f0 ("sched_ext: Implement cgroup subtree iteration for scx_task_iter") Reported-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 8 +++++--- kernel/sched/ext.c | 6 ++++-- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..f6d037a30fd8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -53,6 +53,7 @@ struct kernel_clone_args; enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ + CSS_TASK_ITER_WITH_DEAD = (1U << 2), /* include exiting tasks */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1f084ee71443..e51ce4cd3739 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5059,10 +5059,12 @@ repeat: task = list_entry(it->task_pos, struct task_struct, cg_list); /* - * Hide tasks that are exiting but not yet removed. Keep zombie - * leaders with live threads visible. + * Hide tasks that are exiting but not yet removed by default. Keep + * zombie leaders with live threads visible. Usages that need to walk + * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD. */ - if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) + if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) && + (task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) goto repeat; if (it->flags & CSS_TASK_ITER_PROCS) { diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9483be03a4ca..dc5d4787296b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -766,7 +766,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); iter->cgrp = cgrp; iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); return; } #endif @@ -866,7 +867,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) iter->css_pos = css_next_descendant_pre(iter->css_pos, &iter->cgrp->self); if (iter->css_pos) - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); } return NULL; } -- cgit v1.2.3 From ff9eda4ea906b1f02fc260ddc42d2d9bd736a49c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Apr 2026 14:16:35 -1000 Subject: sched_ext: Skip past-sched_ext_dead() tasks in scx_task_iter_next_locked() scx_task_iter's cgroup-scoped mode can return tasks whose sched_ext_dead() has already completed: cgroup_task_dead() removes from cset->tasks after sched_ext_dead() in finish_task_switch() and is irq-work deferred on PREEMPT_RT. The global mode is fine - sched_ext_dead() removes from scx_tasks via list_del_init() first. Callers (sub-sched enable prep/abort/apply, scx_sub_disable(), scx_fail_parent()) assume returned tasks are still on @sch and trip WARN_ON_ONCE() or operate on torn-down state otherwise. Set %SCX_TASK_OFF_TASKS in sched_ext_dead() under @p's rq lock and have scx_task_iter_next_locked() skip flagged tasks under the same lock. Setter and reader serialize on the per-task rq lock - no race. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 33 +++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1a3af2ea2a79..adb9a4de068a 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,6 +101,7 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ + SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* * Bits 8 and 9 are used to carry task state: diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index dc5d4787296b..3f0d8aeaed81 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -928,16 +928,27 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * * Test for idle_sched_class as only init_tasks are on it. */ - if (p->sched_class != &idle_sched_class) - break; - } - if (!p) - return NULL; + if (p->sched_class == &idle_sched_class) + continue; - iter->rq = task_rq_lock(p, &iter->rf); - iter->locked_task = p; + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked_task = p; - return p; + /* + * cgroup_task_dead() removes the dead tasks from cset->tasks + * after sched_ext_dead() and cgroup iteration may see tasks + * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS + * is set by sched_ext_dead() under @p's rq lock. Test it to + * avoid visiting tasks which are already dead from SCX POV. + */ + if (p->scx.flags & SCX_TASK_OFF_TASKS) { + __scx_task_iter_rq_unlock(iter); + continue; + } + + return p; + } + return NULL; } /** @@ -3850,6 +3861,11 @@ void sched_ext_dead(struct task_struct *p) /* * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. + * + * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup + * iteration is only used from sub-sched paths, which require root + * enabled. Root enable transitions every live task to at least READY. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; @@ -3857,6 +3873,7 @@ void sched_ext_dead(struct task_struct *p) rq = task_rq_lock(p, &rf); scx_disable_and_exit_task(scx_task_sched(p), p); + p->scx.flags |= SCX_TASK_OFF_TASKS; task_rq_unlock(rq, p, &rf); } } -- cgit v1.2.3 From 8f78b749f3da0f43990490b4c1193b5ede3eec0a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2026 10:44:20 +0300 Subject: sched/isolation: Make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN Since commit 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management"), kthreads default to use the HK_TYPE_DOMAIN cpumask. IOW, it is no longer affected by the setting of the nohz_full boot kernel parameter. That means HK_TYPE_KTHREAD should now be an alias of HK_TYPE_DOMAIN instead of HK_TYPE_KERNEL_NOISE to correctly reflect the current kthread behavior. Make the change as HK_TYPE_KTHREAD is still being used in some networking code. Fixes: 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management") Signed-off-by: Waiman Long Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/linux/sched/isolation.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index dc3975ff1b2e..cf0fd03dd7a2 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -20,6 +20,11 @@ enum hk_type { HK_TYPE_KERNEL_NOISE, HK_TYPE_MAX, + /* + * HK_TYPE_KTHREAD is now an alias of HK_TYPE_DOMAIN + */ + HK_TYPE_KTHREAD = HK_TYPE_DOMAIN, + /* * The following housekeeping types are only set by the nohz_full * boot commandline option. So they can share the same value. @@ -29,7 +34,6 @@ enum hk_type { HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, - HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION -- cgit v1.2.3