From 4314a44564eb1565349fed7a4192344c5f46fc85 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Wed, 6 May 2026 17:47:12 +0800 Subject: bpf: Fix out-of-bounds read in bpf_patch_call_args() The interpreters_args array only accommodates stack depths up to MAX_BPF_STACK (512 bytes). However, do_misc_fixups() may allow a larger stack depth if JIT is requested. If JIT compilation later fails and falls back to the interpreter, the verifier invokes bpf_patch_call_args() with this oversized stack depth. This causes a load-time out-of-bounds (OOB) read when calculating the interpreter function pointer index. Fix this by changing bpf_patch_call_args() to return an int and explicitly rejecting the JIT fallback (returning -EINVAL) if the stack depth exceeds MAX_BPF_STACK. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Signed-off-by: Yazhou Tang Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/20260506094714.419842-2-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 01e203964892..52b30e9ea431 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2917,7 +2917,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON -void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); #endif struct btf *bpf_get_btf_vmlinux(void); -- cgit v1.2.3 From 58a8f3e2501dc14b8e00e883d6aaf0600a239da7 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Wed, 6 May 2026 17:47:13 +0800 Subject: bpf: Fix s16 truncation for large bpf-to-bpf call offsets Currently, the BPF instruction set allows bpf-to-bpf calls (or internal calls, pseudo calls) to use a 32-bit imm field to represent the relative jump offset. However, when JIT is disabled or falls back to the interpreter, the verifier invokes bpf_patch_call_args() to rewrite the call instruction. In this function, the 32-bit imm is downcast to s16 and stored in the off field. void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } If the original imm exceeds the s16 range (i.e., a jump offset greater than 32767 instructions), this downcast silently truncates the offset, resulting in an incorrect call target. Fix this by: 1. In bpf_patch_call_args(), keeping the imm field unchanged and using the off field to store the index of the interpreter function. 2. In ___bpf_prog_run() for the JMP_CALL_ARGS case, retrieving the interpreter function pointer from the interpreters_args array using the off field as the index, and passing the original imm to calculate the last argument of the interpreter function. After these changes, the truncation issue is resolved, and __bpf_call_base_args is also no longer needed and can be removed, which makes the code cleaner. Performance: In ___bpf_prog_run() for the JMP_CALL_ARGS case, changing the retrieval of the interpreter function pointer from pointer addition to direct array indexing improves performance. The possible reason is that the latter has better instruction-level parallelism. See the v5 discussion [1] for more details. [1] https://lore.kernel.org/bpf/f120c3c4-6999-414a-b514-518bb64b4758@zju.edu.cn/ To avoid requiring bpftool changes, keep the new imm/off encoding internal and restore the legacy xlated dump layout in bpf_insn_prepare_dump(). For bpf-to-bpf call offsets that do not fit in s16, export off as 0 instead of a truncated and misleading value. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Fixes: 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump") Suggested-by: Xu Kuohai Suggested-by: Puranjay Mohan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Signed-off-by: Yazhou Tang Link: https://lore.kernel.org/r/20260506094714.419842-3-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/linux/filter.h | 3 --- kernel/bpf/core.c | 21 ++++++++++++++------- kernel/bpf/fixups.c | 6 +++--- kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ 5 files changed, 49 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 52b30e9ea431..cd191c5fdb0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2918,6 +2918,12 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 ua #ifndef CONFIG_BPF_JIT_ALWAYS_ON int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +s32 bpf_call_args_imm(s16 idx); +#else +static inline s32 bpf_call_args_imm(s16 idx) +{ + return 0; +} #endif struct btf *bpf_get_btf_vmlinux(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index 1ec6d5ba64cc..88a241aac36a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1151,9 +1151,6 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#define __bpf_call_base_args \ - ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ - (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 63044ebe5721..6aa2a8b24030 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1771,6 +1771,9 @@ static u32 abs_s32(s32 x) return x >= 0 ? (u32)x : -(u32)x; } +static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn); + /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -2077,10 +2080,9 @@ select_insn: CONT; JMP_CALL_ARGS: - BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, - BPF_R3, BPF_R4, - BPF_R5, - insn + insn->off + 1); + BPF_R0 = interpreters_args[insn->off](BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5, + insn + insn->imm + 1); CONT; JMP_TAIL_CALL: { @@ -2400,12 +2402,17 @@ int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) /* Prevent out-of-bounds read to interpreters_args */ if (stack_depth > MAX_BPF_STACK) return -EINVAL; - insn->off = (s16) insn->imm; - insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - - __bpf_call_base_args; + insn->off = (round_up(stack_depth, 32) / 32) - 1; insn->code = BPF_JMP | BPF_CALL_ARGS; return 0; } + +s32 bpf_call_args_imm(s16 idx) +{ + if (WARN_ON_ONCE(idx < 0 || idx >= ARRAY_SIZE(interpreters_args))) + return 0; + return BPF_CALL_IMM(interpreters_args[idx]); +} #endif #endif diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index df8f48091321..3692adf62558 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1250,9 +1250,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) } if (!bpf_pseudo_call(insn)) continue; - insn->off = env->insn_aux_data[i].call_imm; - subprog = bpf_find_subprog(env, i + insn->off + 1); - insn->imm = subprog; + insn->imm = env->insn_aux_data[i].call_imm; + subprog = bpf_find_subprog(env, i + insn->imm + 1); + insn->off = subprog; } prog->jited = 1; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3c0214ca934..630d530782fe 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4919,6 +4919,29 @@ out: return map; } +static void prepare_dump_pseudo_call(struct bpf_insn *insn) +{ + s32 call_off = insn->imm; + + /* + * BPF_CALL_ARGS only exists for interpreter fallback. + * 1. For interpreter (BPF_CALL_ARGS): insn->off is the index of + * interpreters_args array, so here using bpf_call_args_imm() + * to get the real address offset. + * 2. For JIT (BPF_CALL): insn->off is the subprog id. + */ + if (insn->code == (BPF_JMP | BPF_CALL_ARGS)) + insn->imm = bpf_call_args_imm(insn->off); + else + insn->imm = insn->off; + + /* Avoid dumping a truncated and misleading pc-relative offset. */ + if (call_off > S16_MAX || call_off < S16_MIN) + insn->off = 0; + else + insn->off = call_off; +} + static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, const struct cred *f_cred) { @@ -4944,6 +4967,9 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, } if (code == (BPF_JMP | BPF_CALL) || code == (BPF_JMP | BPF_CALL_ARGS)) { + /* Restore the legacy xlated dump layout. */ + if (insns[i].src_reg == BPF_PSEUDO_CALL) + prepare_dump_pseudo_call(&insns[i]); if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok(f_cred)) -- cgit v1.2.3 From b5782e2d462c028096f922abca46318cec890670 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:40 +0100 Subject: netfs: Fix missing barriers when accessing stream->subrequests locklessly The list of subrequests attached to stream->subrequests is accessed without locks by netfs_collect_read_results() and netfs_collect_write_results(), and then they access subreq->flags without taking a barrier after getting the subreq pointer from the list. Relatedly, the functions that build the list don't use any sort of write barrier when constructing the list to make sure that the NETFS_SREQ_IN_PROGRESS flag is perceived to be set first if no lock is taken. Fix this by: (1) Add a new list_add_tail_release() function that uses a release barrier to set the pointer to the new member of the list. (2) Add a new list_first_entry_or_null_acquire() function that uses an acquire barrier to read the pointer to the first member in a list (or return NULL). (3) Use list_add_tail_release() when adding a subreq to ->subrequests. (4) Use list_first_entry_or_null_acquire() when initially accessing the front of the list (when an item is removed, the pointer to the new front iterm is obtained under the same lock). Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Link: https://sashiko.dev/#/patchset/20260326104544.509518-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-4-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 3 ++- fs/netfs/misc.c | 1 + fs/netfs/read_collect.c | 6 ++++-- fs/netfs/write_collect.c | 6 ++++-- fs/netfs/write_issue.c | 3 ++- include/linux/list.h | 37 +++++++++++++++++++++++++++++++++++++ 6 files changed, 50 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a27ed501b6d4..15d73026ff64 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -168,7 +168,8 @@ void netfs_queue_read(struct netfs_io_request *rreq, * remove entries off of the front. */ spin_lock(&rreq->lock); - list_add_tail(&subreq->rreq_link, &stream->subrequests); + /* Write IN_PROGRESS before pointer to new subreq */ + list_add_tail_release(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { if (!stream->active) { stream->collected_to = subreq->start; diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 6df89c92b10b..21357907b7ee 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -356,6 +356,7 @@ void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, DEFINE_WAIT(myself); list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + smp_rmb(); /* Read ->next before IN_PROGRESS. */ if (!netfs_check_subreq_in_progress(subreq)) continue; diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index d2d902f46627..3c9b847885c2 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -205,8 +205,10 @@ reassess: * in progress. The issuer thread may be adding stuff to the tail * whilst we're doing this. */ - front = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); + front = list_first_entry_or_null_acquire(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + /* Read first subreq pointer before IN_PROGRESS flag. */ + while (front) { size_t transferred; diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index b194447f4b11..7fbf50907a7f 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -228,8 +228,10 @@ reassess_streams: if (!smp_load_acquire(&stream->active)) continue; - front = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); + front = list_first_entry_or_null_acquire(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + /* Read first subreq pointer before IN_PROGRESS flag. */ + while (front) { trace_netfs_collect_sreq(wreq, front); //_debug("sreq [%x] %llx %zx/%zx", diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 2db688f94125..b0e9690bb90c 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -204,7 +204,8 @@ void netfs_prepare_write(struct netfs_io_request *wreq, * remove entries off of the front. */ spin_lock(&wreq->lock); - list_add_tail(&subreq->rreq_link, &stream->subrequests); + /* Write IN_PROGRESS before pointer to new subreq */ + list_add_tail_release(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { if (!stream->active) { stream->collected_to = subreq->start; diff --git a/include/linux/list.h b/include/linux/list.h index 00ea8e5fb88b..09d979976b3b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -191,6 +191,29 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) __list_add(new, head->prev, head); } +/** + * list_add_tail_release - add a new entry with release barrier + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head, using a release barrier to set + * the ->next pointer that points to it. This is useful for implementing + * queues, in particular one that the elements will be walked through forwards + * locklessly. + */ +static inline void list_add_tail_release(struct list_head *new, + struct list_head *head) +{ + struct list_head *prev = head->prev; + + if (__list_add_valid(new, prev, head)) { + new->next = head; + new->prev = prev; + head->prev = new; + smp_store_release(&prev->next, new); + } +} + /* * Delete a list entry by making the prev/next entries * point to each other. @@ -644,6 +667,20 @@ static inline void list_splice_tail_init(struct list_head *list, pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) +/** + * list_first_entry_or_null_acquire - get the first element from a list with barrier + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null_acquire(ptr, type, member) ({ \ + struct list_head *head__ = (ptr); \ + struct list_head *pos__ = smp_load_acquire(&head__->next); \ + pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ +}) + /** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. -- cgit v1.2.3 From 2c8f4742bb76117d735f92a3932d85239b16c494 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:42 +0100 Subject: netfs: Fix potential for tearing in ->remote_i_size and ->zero_point Fix potential tearing in using ->remote_i_size and ->zero_point by copying i_size_read() and i_size_write() and using the same seqcount as for i_size. We need to make sure that netfslib and the filesystems that use it always hold i_lock whilst updating any of the sizes to prevent i_size_seqcount from getting corrupted. Fixes: 4058f742105e ("netfs: Keep track of the actual remote file size") Fixes: 100ccd18bb41 ("netfs: Optimise away reads above the point at which there can be no data") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-6-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/9p/v9fs_vfs.h | 13 -- fs/9p/vfs_inode.c | 6 +- fs/9p/vfs_inode_dotl.c | 12 +- fs/afs/file.c | 24 +++- fs/afs/inode.c | 31 +++-- fs/afs/internal.h | 11 +- fs/afs/write.c | 2 +- fs/netfs/buffered_read.c | 6 +- fs/netfs/buffered_write.c | 2 +- fs/netfs/direct_write.c | 6 +- fs/netfs/misc.c | 32 +++-- fs/netfs/write_collect.c | 9 +- fs/smb/client/cifsfs.c | 38 ++++-- fs/smb/client/cifssmb.c | 3 +- fs/smb/client/file.c | 13 +- fs/smb/client/inode.c | 14 ++- fs/smb/client/readdir.c | 3 +- fs/smb/client/smb2ops.c | 42 ++++--- fs/smb/client/smb2pdu.c | 3 +- include/linux/netfs.h | 293 ++++++++++++++++++++++++++++++++++++++++++++-- 20 files changed, 450 insertions(+), 113 deletions(-) (limited to 'include') diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index d3aefbec4de6..34c115d7c250 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -75,17 +75,4 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) int v9fs_open_to_dotl_flags(int flags); -static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size) -{ - /* - * 32-bit need the lock, concurrent updates could break the - * sequences and make i_size_read() loop forever. - * 64-bit updates are atomic and can skip the locking. - */ - if (sizeof(i_size) > sizeof(long)) - spin_lock(&inode->i_lock); - i_size_write(inode, i_size); - if (sizeof(i_size) > sizeof(long)) - spin_unlock(&inode->i_lock); -} #endif diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index d1508b1fe109..f468acb8ee7d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1141,11 +1141,13 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - v9inode->netfs.remote_i_size = stat->length; + spin_lock(&inode->i_lock); + netfs_write_remote_i_size(inode, stat->length); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) - v9fs_i_size_write(inode, stat->length); + i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ inode->i_blocks = (stat->length + 512 - 1) >> 9; + spin_unlock(&inode->i_lock); v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 71796a89bcf4..141fb54db65d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -634,10 +634,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - v9inode->netfs.remote_i_size = stat->st_size; + spin_lock(&inode->i_lock); + netfs_write_remote_i_size(inode, stat->st_size); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) - v9fs_i_size_write(inode, stat->st_size); + i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; + spin_unlock(&inode->i_lock); } else { if (stat->st_result_mask & P9_STATS_ATIME) { inode_set_atime(inode, stat->st_atime_sec, @@ -662,13 +664,15 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } + spin_lock(&inode->i_lock); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) { - v9inode->netfs.remote_i_size = stat->st_size; - v9fs_i_size_write(inode, stat->st_size); + netfs_write_remote_i_size(inode, stat->st_size); + i_size_write(inode, stat->st_size); } if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; + spin_unlock(&inode->i_lock); } if (stat->st_result_mask & P9_STATS_GEN) inode->i_generation = stat->st_gen; diff --git a/fs/afs/file.c b/fs/afs/file.c index 85696ac984cc..0467742bfeee 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -427,21 +427,35 @@ static void afs_free_request(struct netfs_io_request *rreq) afs_put_wb_key(rreq->netfs_priv2); } -static void afs_update_i_size(struct inode *inode, loff_t new_i_size) +/* + * Set the file size and block count, taking ->cb_lock and ->i_lock to maintain + * coherency and prevent 64-bit tearing on 32-bit arches. + * + * Also, estimate the number of 512 bytes blocks used, rounded up to nearest 1K + * for consistency with other AFS clients. + */ +void afs_set_i_size(struct afs_vnode *vnode, loff_t new_i_size) { - struct afs_vnode *vnode = AFS_FS_I(inode); + struct inode *inode = &vnode->netfs.inode; loff_t i_size; write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->netfs.inode); + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); if (new_i_size > i_size) { - i_size_write(&vnode->netfs.inode, new_i_size); - inode_set_bytes(&vnode->netfs.inode, new_i_size); + i_size_write(inode, new_i_size); + inode_set_bytes(inode, round_up(new_i_size, 1024)); } + spin_unlock(&inode->i_lock); write_sequnlock(&vnode->cb_lock); fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size); } +static void afs_update_i_size(struct inode *inode, loff_t new_i_size) +{ + afs_set_i_size(AFS_FS_I(inode), new_i_size); +} + static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq) { struct afs_vnode *vnode = AFS_FS_I(wreq->inode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index a5173434f786..19fe2e392885 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -224,7 +224,8 @@ static int afs_inode_init_from_status(struct afs_operation *op, return afs_protocol_error(NULL, afs_eproto_file_type); } - afs_set_i_size(vnode, status->size); + i_size_write(inode, status->size); + inode_set_bytes(inode, status->size); afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; @@ -253,7 +254,8 @@ static void afs_apply_status(struct afs_operation *op, { struct afs_file_status *status = &vp->scb.status; struct afs_vnode *vnode = vp->vnode; - struct inode *inode = &vnode->netfs.inode; + struct netfs_inode *ictx = &vnode->netfs; + struct inode *inode = &ictx->inode; struct timespec64 t; umode_t mode; bool unexpected_jump = false; @@ -336,6 +338,8 @@ static void afs_apply_status(struct afs_operation *op, } if (data_changed) { + unsigned long long zero_point, size = status->size; + inode_set_iversion_raw(inode, status->data_version); /* Only update the size if the data version jumped. If the @@ -343,16 +347,25 @@ static void afs_apply_status(struct afs_operation *op, * idea of what the size should be that's not the same as * what's on the server. */ - vnode->netfs.remote_i_size = status->size; - if (change_size || status->size > i_size_read(inode)) { - afs_set_i_size(vnode, status->size); + spin_lock(&inode->i_lock); + + if (change_size || size > i_size_read(inode)) { + /* We can read the sizes directly as we hold i_lock. */ + zero_point = ictx->_zero_point; + if (unexpected_jump) - vnode->netfs.zero_point = status->size; + zero_point = size; + netfs_write_sizes(inode, size, size, zero_point); + inode_set_bytes(inode, size); inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); + } else { + netfs_write_remote_i_size(inode, size); } + spin_unlock(&inode->i_lock); + if (op->ops == &afs_fetch_data_operation) - op->fetch.subreq->rreq->i_size = status->size; + op->fetch.subreq->rreq->i_size = size; } } @@ -709,7 +722,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, * it, but we need to give userspace the server's size. */ if (S_ISDIR(inode->i_mode)) - stat->size = vnode->netfs.remote_i_size; + stat->size = netfs_read_remote_i_size(inode); } while (read_seqretry(&vnode->cb_lock, seq)); return 0; @@ -889,7 +902,7 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && attr->ia_size < i_size && - attr->ia_size > vnode->netfs.remote_i_size) { + attr->ia_size > netfs_read_remote_i_size(inode)) { truncate_setsize(inode, attr->ia_size); netfs_resize_file(&vnode->netfs, size, false); fscache_resize_cookie(afs_vnode_cache(vnode), diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 599353c33337..816dc848ea71 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1157,6 +1157,7 @@ extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); void afs_fetch_data_async_rx(struct work_struct *work); void afs_fetch_data_immediate_cancel(struct afs_call *call); +void afs_set_i_size(struct afs_vnode *vnode, loff_t new_i_size); /* * flock.c @@ -1758,16 +1759,6 @@ static inline void afs_update_dentry_version(struct afs_operation *op, (void *)(unsigned long)dir_vp->scb.status.data_version; } -/* - * Set the file size and block count. Estimate the number of 512 bytes blocks - * used, rounded up to nearest 1K for consistency with other AFS clients. - */ -static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) -{ - i_size_write(&vnode->netfs.inode, size); - vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; -} - /* * Check for a conflicting operation on a directory that we just unlinked from. * If someone managed to sneak a link or an unlink in on the file we just diff --git a/fs/afs/write.c b/fs/afs/write.c index fcfed9d24e0a..7f34b939706a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -142,7 +142,7 @@ static void afs_issue_write_worker(struct work_struct *work) afs_begin_vnode_operation(op); op->store.write_iter = &subreq->io_iter; - op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size); + op->store.i_size = umax(pos + len, netfs_read_remote_i_size(&vnode->netfs.inode)); op->mtime = inode_get_mtime(&vnode->netfs.inode); afs_wait_for_operation(op); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index fee0aebf5a3d..ebd84a6cc3f0 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -209,7 +209,6 @@ static void netfs_issue_read(struct netfs_io_request *rreq, static void netfs_read_to_pagecache(struct netfs_io_request *rreq, struct readahead_control *ractl) { - struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned long long start = rreq->start; ssize_t size = rreq->len; int ret = 0; @@ -233,7 +232,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq, source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); subreq->source = source; if (source == NETFS_DOWNLOAD_FROM_SERVER) { - unsigned long long zp = umin(ictx->zero_point, rreq->i_size); + unsigned long long zero_point = netfs_read_zero_point(rreq->inode); + unsigned long long zp = umin(zero_point, rreq->i_size); size_t len = subreq->len; if (unlikely(rreq->origin == NETFS_READ_SINGLE)) @@ -249,7 +249,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq, pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx", rreq->debug_id, subreq->debug_index, subreq->len, size, - subreq->start, ictx->zero_point, rreq->i_size); + subreq->start, zero_point, rreq->i_size); netfs_cancel_read(subreq, ret); break; } diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 05ea5b0cc0e8..b6ecd059dc4f 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -230,7 +230,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, * server would just return a block of zeros or a short read if * we try to read it. */ - if (fpos >= ctx->zero_point) { + if (fpos >= netfs_read_zero_point(inode)) { folio_zero_segment(folio, 0, offset); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index f9ab69de3e29..25f8ceb15fad 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -376,8 +376,10 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret < 0) goto out; end = iocb->ki_pos + iov_iter_count(from); - if (end > ictx->zero_point) - ictx->zero_point = end; + spin_lock(&inode->i_lock); + if (end > ictx->_zero_point) + netfs_write_zero_point(inode, end); + spin_unlock(&inode->i_lock); fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode), FSCACHE_INVAL_DIO_WRITE); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 21357907b7ee..bad661ff2bec 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -211,18 +211,25 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct netfs_folio *finfo; - struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); + struct inode *inode = folio_inode(folio); + struct netfs_inode *ctx = netfs_inode(inode); size_t flen = folio_size(folio); _enter("{%lx},%zx,%zx", folio->index, offset, length); if (offset == 0 && length == flen) { - unsigned long long i_size = i_size_read(&ctx->inode); + unsigned long long i_size, remote_i_size, zero_point; unsigned long long fpos = folio_pos(folio), end; + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); end = umin(fpos + flen, i_size); - if (fpos < i_size && end > ctx->zero_point) - ctx->zero_point = end; + if (fpos < i_size && end > zero_point) { + spin_lock(&inode->i_lock); + end = umin(fpos + flen, inode->i_size); + if (fpos < i_size && end > ctx->_zero_point) + netfs_write_zero_point(inode, end); + spin_unlock(&inode->i_lock); + } } folio_wait_private_2(folio); /* [DEPRECATED] */ @@ -292,15 +299,22 @@ EXPORT_SYMBOL(netfs_invalidate_folio); */ bool netfs_release_folio(struct folio *folio, gfp_t gfp) { - struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); - unsigned long long end; + struct inode *inode = folio_inode(folio); + struct netfs_inode *ctx = netfs_inode(inode); + unsigned long long i_size, remote_i_size, zero_point, end; if (folio_test_dirty(folio)) return false; - end = umin(folio_next_pos(folio), i_size_read(&ctx->inode)); - if (end > ctx->zero_point) - ctx->zero_point = end; + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); + end = umin(folio_next_pos(folio), i_size); + if (end > zero_point) { + spin_lock(&inode->i_lock); + end = umin(folio_next_pos(folio), inode->i_size); + if (end > ctx->_zero_point) + netfs_write_zero_point(inode, end); + spin_unlock(&inode->i_lock); + } if (folio_test_private(folio)) return false; diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 7fbf50907a7f..24fc2bb2f8a4 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -57,7 +57,8 @@ static void netfs_dump_request(const struct netfs_io_request *rreq) int netfs_folio_written_back(struct folio *folio) { enum netfs_folio_trace why = netfs_folio_trace_clear; - struct netfs_inode *ictx = netfs_inode(folio->mapping->host); + struct inode *inode = folio_inode(folio); + struct netfs_inode *ictx = netfs_inode(inode); struct netfs_folio *finfo; struct netfs_group *group = NULL; int gcount = 0; @@ -69,8 +70,10 @@ int netfs_folio_written_back(struct folio *folio) unsigned long long fend; fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len; - if (fend > ictx->zero_point) - ictx->zero_point = fend; + spin_lock(&ictx->inode.i_lock); + if (fend > ictx->_zero_point) + netfs_write_zero_point(inode, fend); + spin_unlock(&ictx->inode.i_lock); folio_detach_private(folio); group = finfo->netfs_group; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 9f76b0347fa9..feac491c5070 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -434,7 +434,8 @@ cifs_alloc_inode(struct super_block *sb) spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ - cifs_inode->netfs.remote_i_size = 0; + cifs_inode->netfs._remote_i_size = 0; + cifs_inode->netfs._zero_point = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; cifs_inode->epoch = 0; @@ -1303,7 +1304,8 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, struct cifsFileInfo *smb_file_src = src_file->private_data; struct cifsFileInfo *smb_file_target = dst_file->private_data; struct cifs_tcon *target_tcon, *src_tcon; - unsigned long long destend, fstart, fend, old_size, new_size; + unsigned long long i_size, old_size, new_size, zero_point; + unsigned long long destend, fstart, fend; unsigned int xid; int rc; @@ -1347,7 +1349,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->netfs.remote_i_size < off + len) { + if (netfs_read_remote_i_size(src_inode) < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1368,16 +1370,18 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false); if (rc) goto unlock; - if (fend > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = fend + 1; - old_size = target_cifsi->netfs.remote_i_size; + + spin_lock(&target_inode->i_lock); + if (fend > zero_point) + netfs_write_zero_point(target_inode, fend + 1); + i_size = target_inode->i_size; + spin_unlock(&target_inode->i_lock); /* Discard all the folios that overlap the destination region. */ cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend); truncate_inode_pages_range(&target_inode->i_data, fstart, fend); - fscache_invalidate(cifs_inode_cookie(target_inode), NULL, - i_size_read(target_inode), 0); + fscache_invalidate(cifs_inode_cookie(target_inode), NULL, i_size, 0); rc = -EOPNOTSUPP; if (target_tcon->ses->server->ops->duplicate_extents) { @@ -1402,8 +1406,12 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, rc = -EINVAL; } } - if (rc == 0 && new_size > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = new_size; + if (rc == 0) { + spin_lock(&target_inode->i_lock); + if (new_size > target_cifsi->netfs._zero_point) + netfs_write_zero_point(target_inode, new_size); + spin_unlock(&target_inode->i_lock); + } } /* force revalidate of size and timestamps of target file now @@ -1474,7 +1482,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->netfs.remote_i_size < off + len) { + if (netfs_read_remote_i_size(src_inode) < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1502,8 +1510,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, fscache_resize_cookie(cifs_inode_cookie(target_inode), i_size_read(target_inode)); } - if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = destoff + rc; + if (rc > 0) { + spin_lock(&target_inode->i_lock); + if (destoff + rc > target_cifsi->netfs._zero_point) + netfs_write_zero_point(target_inode, destoff + rc); + spin_unlock(&target_inode->i_lock); + } } file_accessed(src_file); diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 3990a9012264..9e27bfa7376b 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1465,6 +1465,7 @@ cifs_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid) struct cifs_io_subrequest *rdata = mid->callback_data; struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); + struct inode *inode = &ictx->inode; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 1, .rq_iter = rdata->subreq.io_iter }; @@ -1538,7 +1539,7 @@ do_retry: } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && - rdata->subreq.start + trans >= ictx->remote_i_size) { + rdata->subreq.start + trans >= netfs_read_remote_i_size(inode)) { rdata->result = 0; __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); } else if (rdata->got_bytes > 0) { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 664a2c223089..b60344125f27 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2517,18 +2517,23 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result) { struct netfs_io_request *wreq = wdata->rreq; - struct netfs_inode *ictx = netfs_inode(wreq->inode); + struct inode *inode = wreq->inode; + struct netfs_inode *ictx = netfs_inode(inode); loff_t wrend; if (result > 0) { + spin_lock(&inode->i_lock); + wrend = wdata->subreq.start + wdata->subreq.transferred + result; - if (wrend > ictx->zero_point && + if (wrend > ictx->_zero_point && (wdata->rreq->origin == NETFS_UNBUFFERED_WRITE || wdata->rreq->origin == NETFS_DIO_WRITE)) - ictx->zero_point = wrend; - if (wrend > ictx->remote_i_size) + netfs_write_zero_point(inode, wrend); + if (wrend > ictx->_remote_i_size) netfs_resize_file(ictx, wrend, true); + + spin_unlock(&inode->i_lock); } netfs_write_subrequest_terminated(&wdata->subreq, result); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 16a5310155d5..9472c0a6c187 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -119,7 +119,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); mtime = inode_get_mtime(inode); if (timespec64_equal(&mtime, &fattr->cf_mtime) && - cifs_i->netfs.remote_i_size == fattr->cf_eof) { + netfs_read_remote_i_size(inode) == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); return; @@ -173,12 +173,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, CIFS_I(inode)->time = 0; /* force reval */ return -ESTALE; } - if (inode_state_read_once(inode) & I_NEW) - CIFS_I(inode)->netfs.zero_point = fattr->cf_eof; - cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); + if (inode_state_read_once(inode) & I_NEW) + netfs_write_zero_point(inode, fattr->cf_eof); + fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); fattr->cf_atime = timestamp_truncate(fattr->cf_atime, inode); fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode); @@ -212,7 +212,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, else clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); - cifs_i->netfs.remote_i_size = fattr->cf_eof; + netfs_write_remote_i_size(inode, fattr->cf_eof); /* * Can't safely change the file size here if the client is writing to * it due to potential races. @@ -2772,7 +2772,9 @@ cifs_revalidate_mapping(struct inode *inode) if (cifs_sb_flags(cifs_sb) & CIFS_MOUNT_RW_CACHE) goto skip_invalidate; - cifs_inode->netfs.zero_point = cifs_inode->netfs.remote_i_size; + spin_lock(&inode->i_lock); + netfs_write_zero_point(inode, netfs_inode(inode)->_remote_i_size); + spin_unlock(&inode->i_lock); rc = filemap_invalidate_inode(inode, true, 0, LLONG_MAX); if (rc) { cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n", diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index be22bbc4a65a..e860fa08b5e3 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -143,7 +143,8 @@ retry: fattr->cf_rdev = inode->i_rdev; fattr->cf_uid = inode->i_uid; fattr->cf_gid = inode->i_gid; - fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size; + fattr->cf_eof = + netfs_read_remote_i_size(inode); fattr->cf_symlink_target = NULL; } else { CIFS_I(inode)->time = 0; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index e6cb9b144530..0ea3ce1b94ea 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3402,8 +3402,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - struct netfs_inode *ictx = netfs_inode(inode); - unsigned long long i_size, new_size, remote_size; + unsigned long long i_size, new_size, remote_i_size, zero_point; long rc; unsigned int xid; @@ -3414,9 +3413,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, filemap_invalidate_lock(inode->i_mapping); - i_size = i_size_read(inode); - remote_size = ictx->remote_i_size; - if (offset + len >= remote_size && offset < i_size) { + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); + if (offset + len >= remote_i_size && offset < i_size) { unsigned long long top = umin(offset + len, i_size); rc = filemap_write_and_wait_range(inode->i_mapping, offset, top - 1); @@ -3449,9 +3447,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, new_size); if (rc >= 0) { truncate_setsize(inode, new_size); + spin_lock(&inode->i_lock); netfs_resize_file(&cifsi->netfs, new_size, true); - if (offset < cifsi->netfs.zero_point) - cifsi->netfs.zero_point = offset; + if (offset < cifsi->netfs._zero_point) + netfs_write_zero_point(inode, offset); + spin_unlock(&inode->i_lock); fscache_resize_cookie(cifs_inode_cookie(inode), new_size); } } @@ -3474,7 +3474,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; - unsigned long long end = offset + len, i_size, remote_i_size; + unsigned long long end = offset + len, i_size, remote_i_size, zero_point; long rc; unsigned int xid; __u8 set_sparse = 1; @@ -3516,14 +3516,17 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, * that we locally hole-punch the tail of the dirty data, the proposed * EOF update will end up in the wrong place. */ - i_size = i_size_read(inode); - remote_i_size = netfs_inode(inode)->remote_i_size; + netfs_read_sizes(inode, &i_size, &remote_i_size, &zero_point); + if (end > remote_i_size && i_size > remote_i_size) { unsigned long long extend_to = umin(end, i_size); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, extend_to); - if (rc >= 0) - netfs_inode(inode)->remote_i_size = extend_to; + if (rc >= 0) { + spin_lock(&inode->i_lock); + netfs_write_remote_i_size(inode, extend_to); + spin_unlock(&inode->i_lock); + } } unlock: @@ -3787,7 +3790,6 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - struct netfs_inode *ictx = &cifsi->netfs; loff_t old_eof, new_eof; xid = get_xid(); @@ -3805,7 +3807,9 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_pagecache_range(inode, off, old_eof); - ictx->zero_point = old_eof; + spin_lock(&inode->i_lock); + netfs_write_zero_point(inode, old_eof); + spin_unlock(&inode->i_lock); netfs_wait_for_outstanding_io(inode); rc = smb2_copychunk_range(xid, cfile, cfile, off + len, @@ -3822,8 +3826,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, rc = 0; truncate_setsize(inode, new_eof); + spin_lock(&inode->i_lock); netfs_resize_file(&cifsi->netfs, new_eof, true); - ictx->zero_point = new_eof; + netfs_write_zero_point(inode, new_eof); + spin_unlock(&inode->i_lock); fscache_resize_cookie(cifs_inode_cookie(inode), new_eof); out_2: filemap_invalidate_unlock(inode->i_mapping); @@ -3866,13 +3872,17 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_setsize(inode, new_eof); + spin_lock(&inode->i_lock); netfs_resize_file(&cifsi->netfs, i_size_read(inode), true); + spin_unlock(&inode->i_lock); fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode)); rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); if (rc < 0) goto out_2; - cifsi->netfs.zero_point = new_eof; + spin_lock(&inode->i_lock); + netfs_write_zero_point(inode, new_eof); + spin_unlock(&inode->i_lock); rc = smb3_zero_data(file, tcon, off, len, xid); if (rc < 0) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 995fcdd30681..3bd300347f16 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4608,6 +4608,7 @@ smb2_readv_callback(struct TCP_Server_Info *server, struct mid_q_entry *mid) struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); struct smb2_hdr *shdr = (struct smb2_hdr *)rdata->iov[0].iov_base; + struct inode *inode = &ictx->inode; struct cifs_credits credits = { .value = 0, .instance = 0, @@ -4721,7 +4722,7 @@ do_retry: } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && - rdata->subreq.start + trans >= ictx->remote_i_size) { + rdata->subreq.start + trans >= netfs_read_remote_i_size(inode)) { __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ba17ac5bf356..4fd1d796ad73 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -62,8 +62,8 @@ struct netfs_inode { struct fscache_cookie *cache; #endif struct mutex wb_lock; /* Writeback serialisation */ - loff_t remote_i_size; /* Size of the remote file */ - loff_t zero_point; /* Size after which we assume there's no data + loff_t _remote_i_size; /* Size of the remote file */ + loff_t _zero_point; /* Size after which we assume there's no data * on the server */ atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; @@ -474,6 +474,254 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) return container_of(inode, struct netfs_inode, inode); } +/** + * netfs_read_remote_i_size - Read remote_i_size safely + * @inode: The inode to access + * + * Read remote_i_size safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_remote_i_size(const struct inode *inode) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long remote_i_size; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + remote_i_size = ictx->_remote_i_size; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + remote_i_size = ictx->_remote_i_size; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + remote_i_size = smp_load_acquire(&ictx->_remote_i_size); +#endif + return remote_i_size; +} + +/* + * netfs_write_remote_i_size - Set remote_i_size safely + * @inode: The inode to access + * @remote_i_size: The new value for the size of the file on the server + * + * Set remote_i_size safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_remote_i_size(), netfs_write_remote_i_size() does + * need locking around it (normally i_rwsem), otherwise on 32bit/SMP an update + * of i_size_seqcount can be lost, resulting in subsequent i_size_read() calls + * spinning forever. + */ +static inline void netfs_write_remote_i_size(struct inode *inode, + unsigned long long remote_i_size) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_remote_i_size = remote_i_size; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_remote_i_size = remote_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_remote_i_size, remote_i_size); +#endif +} + +/** + * netfs_read_zero_point - Read zero_point safely + * @inode: The inode to access + * + * Read zero_point safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_zero_point(const struct inode *inode) +{ + struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long zero_point; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + zero_point = smp_load_acquire(&ictx->_zero_point); +#endif + return zero_point; +} + +/* + * netfs_write_zero_point - Set zero_point safely + * @inode: The inode to access + * @zero_point: The new value for the point beyond which the server has no data + * + * Set zero_point safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_zero_point(struct inode *inode, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_zero_point() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + +/** + * netfs_read_sizes - Read remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: Where to return the local file size. + * @remote_i_size: Where to return the size of the file on the server + * @zero_point: Where to return the the point beyond which the server has no data + * + * Read remote_i_size and zero_point safely without the potential for tearing + * on 32-bit arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline void netfs_read_sizes(const struct inode *inode, + unsigned long long *i_size, + unsigned long long *remote_i_size, + unsigned long long *zero_point) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in i_size_write() */ + *i_size = smp_load_acquire(&inode->i_size); + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + *remote_i_size = smp_load_acquire(&ictx->_remote_i_size); + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + *zero_point = smp_load_acquire(&ictx->_zero_point); +#endif +} + +/* + * netfs_write_sizes - Set i_size, remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: The new value for the local size of the file + * @remote_i_size: The new value for the size of the file on the server + * @zero_point: The new value for the point beyond which the server has no data + * + * Set both remote_i_size and zero_point safely without the potential for + * tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_sizes(struct inode *inode, + unsigned long long i_size, + unsigned long long remote_i_size, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in i_size_read(), + * netfs_read_remote_i_size() and netfs_read_zero_point() to ensure + * changes related to inode size (such as page contents) are visible + * before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); + smp_store_release(&ictx->_remote_i_size, remote_i_size); + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + /** * netfs_inode_init - Initialise a netfslib inode context * @ctx: The netfs inode to initialise @@ -488,8 +736,8 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, bool use_zero_point) { ctx->ops = ops; - ctx->remote_i_size = i_size_read(&ctx->inode); - ctx->zero_point = LLONG_MAX; + ctx->_remote_i_size = i_size_read(&ctx->inode); + ctx->_zero_point = LLONG_MAX; ctx->flags = 0; atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) @@ -498,7 +746,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, mutex_init(&ctx->wb_lock); /* ->releasepage() drives zero_point */ if (use_zero_point) { - ctx->zero_point = ctx->remote_i_size; + ctx->_zero_point = ctx->_remote_i_size; mapping_set_release_always(ctx->inode.i_mapping); } } @@ -511,13 +759,40 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, * * Inform the netfs lib that a file got resized so that it can adjust its state. */ -static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size, +static inline void netfs_resize_file(struct netfs_inode *ictx, + unsigned long long new_i_size, bool changed_on_server) { +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + struct inode *inode = &ictx->inode; + + preempt_disable(); + write_seqcount_begin(&inode->i_size_seqcount); + if (changed_on_server) + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + write_seqcount_end(&inode->i_size_seqcount); + preempt_enable(); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); if (changed_on_server) - ctx->remote_i_size = new_i_size; - if (new_i_size < ctx->zero_point) - ctx->zero_point = new_i_size; + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size and + * netfs_read_zero_point() to ensure changes related to inode size + * (such as page contents) are visible before we see the changed inode + * size. + */ + if (changed_on_server) + smp_store_release(&ictx->_remote_i_size, new_i_size); + if (new_i_size < ictx->_zero_point) + smp_store_release(&ictx->_zero_point, new_i_size); +#endif } /** -- cgit v1.2.3 From 156ac2ec2ee77c44c4eb7439d6d165247ba12247 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:48 +0100 Subject: netfs: Fix netfs_invalidate_folio() to clear dirty bit if all changes gone If a streaming write is made, this will leave the relevant modified folio in a not-uptodate, but dirty state with a netfs_folio struct hung off of folio->private indicating the dirty range. Subsequently truncating the file such that the dirty data in the folio is removed, but the first part of the folio theoretically remains will cause the netfs_folio struct to be discarded... but will leave the dirty flag set. If the folio is then read via mmap(), netfs_read_folio() will see that the page is dirty and jump to netfs_read_gaps() to fill in the missing bits. netfs_read_gaps(), however, expects there to be a netfs_folio struct present and can oops because truncate removed it. Fix this by calling folio_cancel_dirty() in netfs_invalidate_folio() in the event that all the dirty data in the folio is erased (as nfs does). Also add some tracepoints to log modifications to a dirty page. This can be reproduced with something like: dd if=/dev/zero of=/xfstest.test/foo bs=1M count=1 umount /xfstest.test mount /xfstest.test xfs_io -c "w 0xbbbf 0xf96c" \ -c "truncate 0xbbbf" \ -c "mmap -r 0xb000 0x11000" \ -c "mr 0xb000 0x11000" \ /xfstest.test/foo with fscaching disabled (otherwise streaming writes are suppressed) and a change to netfs_perform_write() to disallow streaming writes if the fd is open O_RDWR: if (//(file->f_mode & FMODE_READ) || <--- comment this out netfs_is_cache_enabled(ctx)) { It should be reproducible even without this change, but if prevents the above trivial xfs_io command from reproducing it. Note that the initial dd is important: the file must start out sufficiently large that the zero-point logic doesn't just clear the gaps because it knows there's nothing in the file to read yet. Unmounting and mounting is needed to clear the pagecache (there are other ways to do that that may also work). This was initially reproduced with the generic/522 xfstest on some patches that remove the FMODE_READ restriction. Fixes: 9ebff83e6481 ("netfs: Prep to use folio->private for write grouping and streaming write") Reported-by: Marc Dionne Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-12-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/misc.c | 6 +++++- include/trace/events/netfs.h | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 723571ca1b88..24b20e80e9a8 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -263,6 +263,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) /* Move the start of the data. */ finfo->dirty_len = fend - iend; finfo->dirty_offset = offset; + trace_netfs_folio(folio, netfs_folio_trace_invalidate_front); return; } @@ -271,12 +272,14 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) */ if (iend >= fend) { finfo->dirty_len = offset - fstart; + trace_netfs_folio(folio, netfs_folio_trace_invalidate_tail); return; } /* A partial write was split. The caller has already zeroed * it, so just absorb the hole. */ + trace_netfs_folio(folio, netfs_folio_trace_invalidate_middle); } return; @@ -284,8 +287,9 @@ erase_completely: netfs_put_group(netfs_folio_group(folio)); folio_detach_private(folio); folio_clear_uptodate(folio); + folio_cancel_dirty(folio); kfree(finfo); - return; + trace_netfs_folio(folio, netfs_folio_trace_invalidate_all); } EXPORT_SYMBOL(netfs_invalidate_folio); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 8c936fc575d5..0b702f74aefe 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -194,6 +194,10 @@ EM(netfs_folio_trace_copy_to_cache, "mark-copy") \ EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ + EM(netfs_folio_trace_invalidate_all, "inval-all") \ + EM(netfs_folio_trace_invalidate_front, "inval-front") \ + EM(netfs_folio_trace_invalidate_middle, "inval-mid") \ + EM(netfs_folio_trace_invalidate_tail, "inval-tail") \ EM(netfs_folio_trace_kill, "kill") \ EM(netfs_folio_trace_kill_cc, "kill-cc") \ EM(netfs_folio_trace_kill_g, "kill-g") \ -- cgit v1.2.3 From 7b4dcf1b9455a6e52ac7478b4057dbe10359576d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:50 +0100 Subject: netfs: Fix streaming write being overwritten In order to avoid reading whilst writing, netfslib will allow "streaming writes" in which dirty data is stored directly into folios without reading them first. Such folios are marked dirty but may not be marked uptodate. If a folio is entirely written by a streaming write, uptodate will be set, otherwise it will have a netfs_folio struct attached to ->private recording the dirty region. In the event that a partially written streaming write page is to be overwritten entirely by a single write(), netfs_perform_write() will try to copy over it, but doesn't discard the netfs_folio if it succeeds; further, it doesn't correctly handle a partial copy that overwrites some of the dirty data. Fix this by the following: (1) If the folio is successfully overwritten, free the netfs_folio struct before marking the page uptodate. (2) If the copy to the folio partially fails, but short of the dirty data, just ignore the copy. (3) If the copy partially fails and overwrites some of the dirty data, accept the copy, update the netfs_folio struct to record the new data. If the folio is now filled, free the netfs_folio and set uptodate, otherwise return a partial write. Found with: fsx -q -N 1000000 -p 10000 -o 128000 -l 600000 \ /xfstest.test/junk --replay-ops=junk.fsxops using the following as junk.fsxops: truncate 0x0 0 0x927c0 write 0x63fb8 0x53c8 0 copy_range 0xb704 0x19b9 0x24429 0x79380 write 0x2402b 0x144a2 0x90660 * write 0x204d5 0x140a0 0x927c0 * copy_range 0x1f72c 0x137d0 0x7a906 0x927c0 * read 0x00000 0x20000 0x9157c read 0x20000 0x20000 0x9157c read 0x40000 0x20000 0x9157c read 0x60000 0x20000 0x9157c read 0x7e1a0 0xcfb9 0x9157c on cifs with the default cache option. It shows folio 0x24 misbehaving if the FMODE_READ check is commented out in netfs_perform_write(): if (//(file->f_mode & FMODE_READ) || netfs_is_cache_enabled(ctx)) { and no fscache. This was initially found with the generic/522 xfstest. Fixes: 8f52de0077ba ("netfs: Reduce number of conditional branches in netfs_perform_write()") Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-14-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 47 ++++++++++++++++++++++++++++++++------------ include/trace/events/netfs.h | 3 +++ 2 files changed, 37 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 278aeb074e75..991552724868 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -246,18 +246,38 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, /* See if we can write a whole folio in one go. */ if (!maybe_trouble && offset == 0 && part >= flen) { copied = copy_folio_from_iter_atomic(folio, offset, part, iter); - if (unlikely(copied == 0)) + if (likely(copied == part)) { + if (finfo) { + trace = netfs_whole_folio_modify_filled; + goto folio_now_filled; + } + __netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + trace = netfs_whole_folio_modify; + goto copied; + } + if (copied == 0) goto copy_failed; - if (unlikely(copied < part)) { + if (!finfo || copied <= finfo->dirty_offset) { maybe_trouble = true; iov_iter_revert(iter, copied); copied = 0; folio_unlock(folio); goto retry; } - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace = netfs_whole_folio_modify; + + /* We overwrote some existing dirty data, so we have to + * accept the partial write. + */ + finfo->dirty_len += finfo->dirty_offset; + if (finfo->dirty_len == flen) { + trace = netfs_whole_folio_modify_filled_efault; + goto folio_now_filled; + } + if (copied > finfo->dirty_len) + finfo->dirty_len = copied; + finfo->dirty_offset = 0; + trace = netfs_whole_folio_modify_efault; goto copied; } @@ -327,16 +347,10 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto copy_failed; finfo->dirty_len += copied; if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { - if (finfo->netfs_group) - folio_change_private(folio, finfo->netfs_group); - else - folio_detach_private(folio); - folio_mark_uptodate(folio); - kfree(finfo); trace = netfs_streaming_cont_filled_page; - } else { - trace = netfs_streaming_write_cont; + goto folio_now_filled; } + trace = netfs_streaming_write_cont; goto copied; } @@ -350,6 +364,13 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto out; continue; + folio_now_filled: + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + folio_mark_uptodate(folio); + kfree(finfo); copied: trace_netfs_folio(folio, trace); flush_dcache_folio(folio); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 0b702f74aefe..aa9940ba307b 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -177,6 +177,9 @@ EM(netfs_folio_is_uptodate, "mod-uptodate") \ EM(netfs_just_prefetch, "mod-prefetch") \ EM(netfs_whole_folio_modify, "mod-whole-f") \ + EM(netfs_whole_folio_modify_efault, "mod-whole-f!") \ + EM(netfs_whole_folio_modify_filled, "mod-whole-f+") \ + EM(netfs_whole_folio_modify_filled_efault, "mod-whole-f+!") \ EM(netfs_modify_and_clear, "mod-n-clear") \ EM(netfs_streaming_write, "mod-streamw") \ EM(netfs_streaming_write_cont, "mod-streamw+") \ -- cgit v1.2.3 From dbe556972100fabb8e5a1b3d2163831ff07b1e8e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:56 +0100 Subject: netfs: Fix potential UAF in netfs_unlock_abandoned_read_pages() netfs_unlock_abandoned_read_pages(rreq) accesses the index of the folios it is wanting to unlock and compares that to rreq->no_unlock_folio so that it doesn't unlock a folio being read for netfs_perform_write() or netfs_write_begin(). However, given that netfs_unlock_abandoned_read_pages() is called _after_ NETFS_RREQ_IN_PROGRESS is cleared, the one folio that it's not allowed to dereference is the one specified by ->no_unlock_folio as ownership immediately reverts to the caller. Fix this by storing the folio pointer instead and using that rather than the index. Also fix netfs_unlock_read_folio() where the same applies. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-20-dhowells@redhat.com cc: Paulo Alcantara cc: Viacheslav Dubeyko cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 4 ++-- fs/netfs/read_collect.c | 2 +- fs/netfs/read_retry.c | 2 +- include/linux/netfs.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 004d426c02b4..83d0b8153e96 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -670,7 +670,7 @@ retry: ret = PTR_ERR(rreq); goto error; } - rreq->no_unlock_folio = folio->index; + rreq->no_unlock_folio = folio; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); @@ -736,7 +736,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, goto error; } - rreq->no_unlock_folio = folio->index; + rreq->no_unlock_folio = folio; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); ret = netfs_begin_cache_read(rreq, ctx); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 3c9b847885c2..23660a590124 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -83,7 +83,7 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq, } just_unlock: - if (folio->index == rreq->no_unlock_folio && + if (folio == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { _debug("no unlock"); } else { diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index e10eb5a07332..f59a70f3a086 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -292,7 +292,7 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq) struct folio *folio = folioq_folio(p, slot); if (folio && !folioq_is_marked2(p, slot)) { - if (folio->index == rreq->no_unlock_folio && + if (folio == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { _debug("no unlock"); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4fd1d796ad73..243c0f737938 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -252,7 +252,7 @@ struct netfs_io_request { unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ unsigned long long abandon_to; /* Position to abandon folios to */ - pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ + const struct folio *no_unlock_folio; /* Don't unlock this folio after read */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; unsigned int rsize; /* Maximum read size (0 for none) */ -- cgit v1.2.3 From ccde2ac757c713535b224233a296de40efe5212d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:58 +0100 Subject: netfs: Fix folio->private handling in netfs_perform_write() Under some circumstances, netfs_perform_write() doesn't correctly manipulate folio->private between NULL, NETFS_FOLIO_COPY_TO_CACHE, pointing to a group and pointing to a netfs_folio struct, leading to potential multiple attachments of private data with associated folio ref leaks and also leaks of netfs_folio structs or netfs_group refs. Fix this by consolidating the place at which a folio is marked uptodate in one place and having that look at what's attached to folio->private and decide how to clean it up and then set the new group. Also, the content shouldn't be flushed if group is NULL, even if a group is specified in the netfs_group parameter, as that would be the case for a new folio. A filesystem should always specify netfs_group or never specify netfs_group. The Sashiko auto-review tool noted that it was theoretically possible that the fpos >= ctx->zero_point section might leak if it modified a streaming write folio. This is unlikely, but with a network filesystem, third party changes can happen. It also pointed out that __netfs_set_group() would leak if called multiple times on the same folio from the "whole folio modify section". Fixes: 8f52de0077ba ("netfs: Reduce number of conditional branches in netfs_perform_write()") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-22-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 134 ++++++++++++++++++++++++++----------------- include/trace/events/netfs.h | 1 + 2 files changed, 82 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index f79fb5996540..6bde3320bcec 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -12,24 +12,6 @@ #include #include "internal.h" -static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) -{ - if (netfs_group) - folio_attach_private(folio, netfs_get_group(netfs_group)); -} - -static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) -{ - void *priv = folio_get_private(folio); - - if (unlikely(priv != netfs_group)) { - if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) - folio_attach_private(folio, netfs_get_group(netfs_group)); - else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) - folio_detach_private(folio); - } -} - /* * Grab a folio for writing and lock it. Attempt to allocate as large a folio * as possible to hold as much of the remaining length as possible in one go. @@ -157,6 +139,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, size_t offset; /* Offset into pagecache folio */ size_t part; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ + void *priv; offset = pos & (max_chunk - 1); part = min(max_chunk - offset, iov_iter_count(iter)); @@ -202,6 +185,25 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto error_folio_unlock; } + finfo = netfs_folio_info(folio); + group = netfs_folio_group(folio); + + /* If the requested group differs from the group set on the + * page, then we need to flush out the folio if it has a group + * set (ie. is non-NULL). Note that COPY_TO_CACHE is a special + * case, being a netfs annotation rather than an actual group. + * + * The filesystem isn't permitted to mix writes with groups and + * writes without groups as the NULL group is used to indicate + * that no group is set. + */ + if (unlikely(group != netfs_group) && + group != NETFS_FOLIO_COPY_TO_CACHE && + group) { + WARN_ON_ONCE(!netfs_group); + goto flush_content; + } + /* Decide how we should modify a folio. We might be attempting * to do write-streaming, as we don't want to a local RMW cycle * if we can avoid it. If we're doing local caching or content @@ -209,22 +211,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, * file is open readably, then we let ->read_folio() fill in * the gaps. */ - finfo = netfs_folio_info(folio); - group = netfs_folio_group(folio); - - if (unlikely(group != netfs_group) && - group != NETFS_FOLIO_COPY_TO_CACHE) - goto flush_content; - if (folio_test_uptodate(folio)) { if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - netfs_set_group(folio, netfs_group); trace = netfs_folio_is_uptodate; - goto copied; + goto copied_uptodate; } /* If the page is above the zero-point then we assume that the @@ -237,24 +231,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(copied == 0)) goto copy_failed; folio_zero_segment(folio, offset + copied, flen); - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace = netfs_modify_and_clear; - goto copied; + if (finfo) + trace = netfs_modify_and_clear_rm_finfo; + else + trace = netfs_modify_and_clear; + goto mark_uptodate; } /* See if we can write a whole folio in one go. */ if (!maybe_trouble && offset == 0 && part >= flen) { copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (likely(copied == part)) { - if (finfo) { + if (finfo) trace = netfs_whole_folio_modify_filled; - goto folio_now_filled; - } - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace = netfs_whole_folio_modify; - goto copied; + else + trace = netfs_whole_folio_modify; + goto mark_uptodate; } if (copied == 0) goto copy_failed; @@ -272,7 +264,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, finfo->dirty_len += finfo->dirty_offset; if (finfo->dirty_len == flen) { trace = netfs_whole_folio_modify_filled_efault; - goto folio_now_filled; + goto mark_uptodate; } if (copied > finfo->dirty_len) finfo->dirty_len = copied; @@ -300,11 +292,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - netfs_set_group(folio, netfs_group); trace = netfs_just_prefetch; - goto copied; + goto copied_uptodate; } + /* Do a streaming write on a folio that has nothing in it yet. */ if (!finfo) { ret = -EIO; if (WARN_ON(folio_get_private(folio))) @@ -313,10 +305,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(copied == 0)) goto copy_failed; if (offset == 0 && copied == flen) { - __netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); trace = netfs_streaming_filled_page; - goto copied; + goto mark_uptodate; } finfo = kzalloc_obj(*finfo); @@ -345,7 +335,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, finfo->dirty_len += copied; if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { trace = netfs_streaming_cont_filled_page; - goto folio_now_filled; + goto mark_uptodate; } trace = netfs_streaming_write_cont; goto copied; @@ -361,13 +351,36 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto out; continue; - folio_now_filled: - if (finfo->netfs_group) - folio_change_private(folio, finfo->netfs_group); - else - folio_detach_private(folio); + /* Mark a folio as being up to data when we've filled it + * completely. If the folio has a group attached, then it must + * be the same group, otherwise we should have flushed it out + * above. We have to get rid of the netfs_folio struct if + * there was one. + */ + mark_uptodate: folio_mark_uptodate(folio); - kfree(finfo); + + copied_uptodate: + priv = folio_get_private(folio); + if (likely(priv == netfs_group)) { + /* Already set correctly; no change required. */ + } else if (priv == NETFS_FOLIO_COPY_TO_CACHE) { + if (!netfs_group) + folio_detach_private(folio); + else + folio_change_private(folio, netfs_get_group(netfs_group)); + } else if (!priv) { + folio_attach_private(folio, netfs_get_group(netfs_group)); + } else { + WARN_ON_ONCE(!finfo); + if (netfs_group) + /* finfo->netfs_group has a ref */ + folio_change_private(folio, netfs_group); + else + folio_detach_private(folio); + kfree(finfo); + } + copied: trace_netfs_folio(folio, trace); flush_dcache_folio(folio); @@ -530,6 +543,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); vm_fault_t ret = VM_FAULT_NOPAGE; + void *priv; int err; _enter("%lx", folio->index); @@ -550,7 +564,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr } group = netfs_folio_group(folio); - if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) { + if (group && + group != netfs_group && + group != NETFS_FOLIO_COPY_TO_CACHE) { folio_unlock(folio); err = filemap_fdatawrite_range(mapping, folio_pos(folio), @@ -572,7 +588,19 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus); else trace_netfs_folio(folio, netfs_folio_trace_mkwrite); - netfs_set_group(folio, netfs_group); + + priv = folio_get_private(folio); + if (priv != netfs_group) { + if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) + folio_detach_private(folio); + else if (netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) + folio_change_private(folio, netfs_get_group(netfs_group)); + else if (netfs_group && !priv) + folio_attach_private(folio, netfs_get_group(netfs_group)); + else + WARN_ON_ONCE(1); + } + file_update_time(file); set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags); if (ictx->ops->post_modify) diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index aa9940ba307b..082cb03c6131 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -181,6 +181,7 @@ EM(netfs_whole_folio_modify_filled, "mod-whole-f+") \ EM(netfs_whole_folio_modify_filled_efault, "mod-whole-f+!") \ EM(netfs_modify_and_clear, "mod-n-clear") \ + EM(netfs_modify_and_clear_rm_finfo, "mod-n-clear+") \ EM(netfs_streaming_write, "mod-streamw") \ EM(netfs_streaming_write_cont, "mod-streamw+") \ EM(netfs_flush_content, "flush") \ -- cgit v1.2.3 From 620072fd783290ad92c2d445a47b0a61b161f352 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Apr 2026 12:31:17 -0700 Subject: mm/damon: fix damos_stat tracepoint format for sz_applied The print format is wrongly marking sz_applied as sz_tried. Fix it. Link: https://lore.kernel.org/20260426193119.88095-1-sj@kernel.org Fixes: 804c26b961da ("mm/damon/core: add trace point for damos stat per apply interval") Signed-off-by: SeongJae Park Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: # 7.0.x Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 24fc402ab3c8..7e25f4469b81 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -41,7 +41,7 @@ TRACE_EVENT(damos_stat_after_apply_interval, ), TP_printk("ctx_idx=%u scheme_idx=%u nr_tried=%lu sz_tried=%lu " - "nr_applied=%lu sz_tried=%lu sz_ops_filter_passed=%lu " + "nr_applied=%lu sz_applied=%lu sz_ops_filter_passed=%lu " "qt_exceeds=%lu nr_snapshots=%lu", __entry->context_idx, __entry->scheme_idx, __entry->nr_tried, __entry->sz_tried, -- cgit v1.2.3 From 6a288a4ddb4a994490505ab5f41c445f8e6b6467 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Tue, 21 Apr 2026 17:39:07 +0200 Subject: mm/page_alloc: fix initialization of tags of the huge zero folio with init_on_free __GFP_ZEROTAGS semantics are currently a bit weird, but effectively this flag is only ever set alongside __GFP_ZERO and __GFP_SKIP_KASAN. If we run with init_on_free, we will zero out pages during __free_pages_prepare(), to skip zeroing on the allocation path. However, when allocating with __GFP_ZEROTAG set, post_alloc_hook() will consequently not only skip clearing page content, but also skip clearing tag memory. Not clearing tags through __GFP_ZEROTAGS is irrelevant for most pages that will get mapped to user space through set_pte_at() later: set_pte_at() and friends will detect that the tags have not been initialized yet (PG_mte_tagged not set), and initialize them. However, for the huge zero folio, which will be mapped through a PMD marked as special, this initialization will not be performed, ending up exposing whatever tags were still set for the pages. The docs (Documentation/arch/arm64/memory-tagging-extension.rst) state that allocation tags are set to 0 when a page is first mapped to user space. That no longer holds with the huge zero folio when init_on_free is enabled. Fix it by decoupling __GFP_ZEROTAGS from __GFP_ZERO, passing to tag_clear_highpages() whether we want to also clear page content. Invert the meaning of the tag_clear_highpages() return value to have clearer semantics. Reproduced with the huge zero folio by modifying the check_buffer_fill arm64/mte selftest to use a 2 MiB area, after making sure that pages have a non-0 tag set when freeing (note that, during boot, we will not actually initialize tags, but only set KASAN_TAG_KERNEL in the page flags). $ ./check_buffer_fill 1..20 ... not ok 17 Check initial tags with private mapping, sync error mode and mmap memory not ok 18 Check initial tags with private mapping, sync error mode and mmap/mprotect memory ... This code needs more cleanups; we'll tackle that next, like decoupling __GFP_ZEROTAGS from __GFP_SKIP_KASAN. [akpm@linux-foundation.org: s/__GPF_ZERO/__GFP_ZERO/, per David] Link: https://lore.kernel.org/20260421-zerotags-v2-1-05cb1035482e@kernel.org Fixes: adfb6609c680 ("mm/huge_memory: initialise the tags of the huge zero folio") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Catalin Marinas Tested-by: Lance Yang Cc: Brendan Jackman Cc: Dev Jain Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Will Deacon Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- arch/arm64/include/asm/page.h | 2 +- arch/arm64/mm/fault.c | 11 +++++++---- include/linux/gfp_types.h | 10 +++++----- include/linux/highmem.h | 7 ++++--- mm/page_alloc.c | 8 ++++---- 5 files changed, 21 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index e25d0d18f6d7..58200de8a221 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -33,7 +33,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); #define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio -bool tag_clear_highpages(struct page *to, int numpages); +bool tag_clear_highpages(struct page *to, int numpages, bool clear_pages); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0f3c5c7ca054..739800835920 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -1018,7 +1018,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, return vma_alloc_folio(flags, 0, vma, vaddr); } -bool tag_clear_highpages(struct page *page, int numpages) +bool tag_clear_highpages(struct page *page, int numpages, bool clear_pages) { /* * Check if MTE is supported and fall back to clear_highpage(). @@ -1026,13 +1026,16 @@ bool tag_clear_highpages(struct page *page, int numpages) * post_alloc_hook() will invoke tag_clear_highpages(). */ if (!system_supports_mte()) - return false; + return clear_pages; /* Newly allocated pages, shouldn't have been tagged yet */ for (int i = 0; i < numpages; i++, page++) { WARN_ON_ONCE(!try_page_mte_tagging(page)); - mte_zero_clear_page_tags(page_address(page)); + if (clear_pages) + mte_zero_clear_page_tags(page_address(page)); + else + mte_clear_page_tags(page_address(page)); set_page_mte_tagged(page); } - return true; + return false; } diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6c75df30a281..cd4972a7c97c 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -273,11 +273,11 @@ enum { * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself - * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that - * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting - * memory tags at the same time as zeroing memory has minimal additional - * performance impact. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time. Setting memory tags at + * the same time as zeroing memory (e.g., with __GFP_ZERO) has minimal + * additional performance impact. However, __GFP_ZEROTAGS also zeroes the tags + * even if memory is not getting zeroed at allocation time (e.g., + * with init_on_free). * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by diff --git a/include/linux/highmem.h b/include/linux/highmem.h index af03db851a1d..d7aac9de1c8a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -347,10 +347,11 @@ static inline void clear_highpage_kasan_tagged(struct page *page) #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -/* Return false to let people know we did not initialize the pages */ -static inline bool tag_clear_highpages(struct page *page, int numpages) +/* Returns true if the caller has to initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages, + bool clear_pages) { - return false; + return clear_pages; } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 227d58dc3de6..23c7298d3be2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1808,9 +1808,9 @@ static inline bool should_skip_init(gfp_t flags) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + const bool zero_tags = gfp_flags & __GFP_ZEROTAGS; bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); - bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); int i; set_page_private(page, 0); @@ -1832,11 +1832,11 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ /* - * If memory tags should be zeroed - * (which happens only when memory should be initialized as well). + * Clearing tags can efficiently clear the memory for us as well, if + * required. */ if (zero_tags) - init = !tag_clear_highpages(page, 1 << order); + init = tag_clear_highpages(page, 1 << order, /* clear_pages= */init); if (!should_skip_kasan_unpoison(gfp_flags) && kasan_unpoison_pages(page, order, init)) { -- cgit v1.2.3 From e83f5e24da741fa9405aeeff00b08c5ee7c37b88 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Wed, 6 May 2026 19:43:30 +0800 Subject: Bluetooth: serialize accept_q access bt_sock_poll() walks the accept queue without synchronization, while child teardown can unlink the same socket and drop its last reference. The unsynchronized accept queue walk has existed since the initial Bluetooth import. Protect accept_q with a dedicated lock for queue updates and polling. Also rework bt_accept_dequeue() to take temporary child references under the queue lock before dropping it and locking the child socket. Fixes: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Reported-by: Jann Horn Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Signed-off-by: Jiexun Wang Reviewed-by: Jann Horn Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/af_bluetooth.c | 87 +++++++++++++++++++++++++++++---------- 2 files changed, 66 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 69eed69f7f26..3faea66b1979 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -398,6 +398,7 @@ void baswap(bdaddr_t *dst, const bdaddr_t *src); struct bt_sock { struct sock sk; struct list_head accept_q; + spinlock_t accept_q_lock; /* protects accept_q */ struct sock *parent; unsigned long flags; void (*skb_msg_name)(struct sk_buff *, void *, int *); diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 33d053d63407..9d68dd86023c 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -154,6 +154,7 @@ struct sock *bt_sock_alloc(struct net *net, struct socket *sock, sock_init_data(sock, sk); INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + spin_lock_init(&bt_sk(sk)->accept_q_lock); sock_reset_flag(sk, SOCK_ZAPPED); @@ -214,6 +215,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) { const struct cred *old_cred; struct pid *old_pid; + struct bt_sock *par = bt_sk(parent); BT_DBG("parent %p, sk %p", parent, sk); @@ -224,9 +226,13 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) else lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; + spin_lock_bh(&par->accept_q_lock); + list_add_tail(&bt_sk(sk)->accept_q, &par->accept_q); + sk_acceptq_added(parent); + spin_unlock_bh(&par->accept_q_lock); + /* Copy credentials from parent since for incoming connections the * socket is allocated by the kernel. */ @@ -244,8 +250,6 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) bh_unlock_sock(sk); else release_sock(sk); - - sk_acceptq_added(parent); } EXPORT_SYMBOL(bt_accept_enqueue); @@ -254,45 +258,72 @@ EXPORT_SYMBOL(bt_accept_enqueue); */ void bt_accept_unlink(struct sock *sk) { + struct sock *parent = bt_sk(sk)->parent; + BT_DBG("sk %p state %d", sk, sk->sk_state); + spin_lock_bh(&bt_sk(parent)->accept_q_lock); list_del_init(&bt_sk(sk)->accept_q); - sk_acceptq_removed(bt_sk(sk)->parent); + sk_acceptq_removed(parent); + spin_unlock_bh(&bt_sk(parent)->accept_q_lock); bt_sk(sk)->parent = NULL; sock_put(sk); } EXPORT_SYMBOL(bt_accept_unlink); +static struct sock *bt_accept_get(struct sock *parent, struct sock *sk) +{ + struct bt_sock *bt = bt_sk(parent); + struct sock *next = NULL; + + /* accept_q is modified from child teardown paths too, so take a + * temporary reference before dropping the queue lock. + */ + spin_lock_bh(&bt->accept_q_lock); + + if (sk) { + if (bt_sk(sk)->parent != parent) + goto out; + + if (!list_is_last(&bt_sk(sk)->accept_q, &bt->accept_q)) { + next = &list_next_entry(bt_sk(sk), accept_q)->sk; + sock_hold(next); + } + } else if (!list_empty(&bt->accept_q)) { + next = &list_first_entry(&bt->accept_q, + struct bt_sock, accept_q)->sk; + sock_hold(next); + } + +out: + spin_unlock_bh(&bt->accept_q_lock); + return next; +} + struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) { - struct bt_sock *s, *n; - struct sock *sk; + struct sock *sk, *next; BT_DBG("parent %p", parent); restart: - list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { - sk = (struct sock *)s; - + for (sk = bt_accept_get(parent, NULL); sk; sk = next) { /* Prevent early freeing of sk due to unlink and sock_kill */ - sock_hold(sk); lock_sock(sk); /* Check sk has not already been unlinked via * bt_accept_unlink() due to serialisation caused by sk locking */ - if (!bt_sk(sk)->parent) { + if (bt_sk(sk)->parent != parent) { BT_DBG("sk %p, already unlinked", sk); release_sock(sk); sock_put(sk); - /* Restart the loop as sk is no longer in the list - * and also avoid a potential infinite loop because - * list_for_each_entry_safe() is not thread safe. - */ goto restart; } + next = bt_accept_get(parent, sk); + /* sk is safely in the parent list so reduce reference count */ sock_put(sk); @@ -310,6 +341,8 @@ restart: sock_graft(sk, newsock); release_sock(sk); + if (next) + sock_put(next); return sk; } @@ -518,18 +551,28 @@ EXPORT_SYMBOL(bt_sock_stream_recvmsg); static inline __poll_t bt_accept_poll(struct sock *parent) { - struct bt_sock *s, *n; + struct bt_sock *bt = bt_sk(parent); + struct bt_sock *s; struct sock *sk; + __poll_t mask = 0; + + spin_lock_bh(&bt->accept_q_lock); + list_for_each_entry(s, &bt->accept_q, accept_q) { + int state; - list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { sk = (struct sock *)s; - if (sk->sk_state == BT_CONNECTED || - (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) && - sk->sk_state == BT_CONNECT2)) - return EPOLLIN | EPOLLRDNORM; + state = READ_ONCE(sk->sk_state); + + if (state == BT_CONNECTED || + (test_bit(BT_SK_DEFER_SETUP, &bt->flags) && + state == BT_CONNECT2)) { + mask = EPOLLIN | EPOLLRDNORM; + break; + } } + spin_unlock_bh(&bt->accept_q_lock); - return 0; + return mask; } __poll_t bt_sock_poll(struct file *file, struct socket *sock, -- cgit v1.2.3 From 5522d65d81a711c60a9969d37a485d48d0ad1496 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 10 May 2026 13:46:05 +0300 Subject: ipvs: avoid possible loop in ip_vs_dst_event on resizing Sashiko points out that unprivileged user can frequently call ip_vs_flush() or ip_vs_del_service() to trigger svc_table_changes updates that can lead to infinite loop in ip_vs_dst_event(). This can also happen if the user triggers frequent table resizing without deleting all services. We should also consider the possible effects if the user triggers many NETDEV_DOWN events. One way to solve it is to hold svc_resize_sem in ip_vs_dst_event() but this can block the dev notifier during the whole resizing process. Instead, use new rw_semaphore svc_replace_sem to protect just the svc_table replacement which is a short code section. Then hold svc_replace_sem in ip_vs_dst_event() to serialize with replacing the svc_table. As result, loop is avoided as there is no need to repeat the table walking from the start. By this way changes in svc_table_changes can happen only when all services are removed and all dev references dropped which allows us to abort the table walking. As IP_VS_WORK_SVC_NORESIZE is the flag used to stop the svc_resize_work under service_mutex, we should check only this flag often but not while under service_mutex. To remove the mutex_trylock() for service_mutex in the second phase where the resizer installs the new table after rehashing, we will avoid holding the service_mutex there. As result, the code in configuration context which is under service_mutex should access ipvs->svc_table under RCU because it can be replaced at anytime and released after a RCU grace period. As for ip_vs_zero_all(), it needs different solution as a table walker which can escape single RCU read-side critical section: to hold the svc_replace_sem to prevent table to be replaced. In ip_vs_status_show() prefer to hold svc_replace_sem to avoid many loops, just detect if the svc_table is removed. Prefer the newly attached table for the u_thresh/l_thresh checks to know when to grow/shrink while adding or deleting services because the new table size is based on the latest parameters. Link: https://sashiko.dev/#/patchset/20260505001648.360569-1-pablo%40netfilter.org Fixes: 840aac3d900d ("ipvs: use resizable hash table for services") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 3 +- net/netfilter/ipvs/ip_vs_ctl.c | 187 +++++++++++++++++++++++++++-------------- 2 files changed, 124 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 02762ce73a0c..a02e569813d2 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1186,8 +1186,9 @@ struct netns_ipvs { struct timer_list dest_trash_timer; /* expiration timer */ struct mutex service_mutex; /* service reconfig */ struct rw_semaphore svc_resize_sem; /* svc_table resizing */ + struct rw_semaphore svc_replace_sem; /* svc_table replace */ struct delayed_work svc_resize_work; /* resize svc_table */ - atomic_t svc_table_changes;/* ++ on new table */ + atomic_t svc_table_changes;/* ++ on table changes */ /* Service counters */ atomic_t num_services[IP_VS_AF_MAX]; /* Services */ atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c7c7f6a7a9f6..bd9cae44d214 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -327,18 +327,22 @@ ip_vs_use_count_dec(void) /* Service hashing: * Operation Locking order * --------------------------------------------------------------------------- - * add table service_mutex, svc_resize_sem(W) - * del table service_mutex - * move between tables svc_resize_sem(W), seqcount_t(W), bit lock - * add/del service service_mutex, bit lock + * add first table service_mutex + * attach new table service_mutex + * add/del service service_mutex, RCU, bit lock + * move between tables (rehash) svc_resize_sem(W), seqcount_t(W), bit lock + * replace old with attached svc_resize_sem(W), svc_replace_sem(W) * find service RCU, seqcount_t(R) * walk services(blocking) service_mutex, svc_resize_sem(R) * walk services(non-blocking) RCU, seqcount_t(R) + * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R) + * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R) + * del table service_mutex after stopped work * - * - new tables are linked/unlinked under service_mutex and svc_resize_sem - * - new table is linked on resizing and all operations can run in parallel - * in 2 tables until the new table is registered as current one - * - two contexts can modify buckets: config and table resize, both in + * - new table is attached on resizing under service_mutex and all operations + * can run in parallel in 2 tables until the new table is registered as current + * one + * - two contexts can modify buckets: config and table resize (work), both in * process context * - only table resizer can move entries, so we do not protect t->seqc[] * items with t->lock[] @@ -346,9 +350,13 @@ ip_vs_use_count_dec(void) * services are moved to new table * - move operations may disturb readers: find operation will not miss entries * but walkers may see same entry twice if they are forced to retry chains - * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold - * service_mutex to disallow new tables to be installed or to check + * or to walk the newly attached second table + * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to check * svc_table_changes and repeat the RCU read section if new table is installed + * - walkers may serialize with the whole resizing process (svc_resize_sem) + * to prevent seeing same service twice or just with the svc_table + * replace (svc_replace_sem) when we can see entries twice but we + * prefer to run concurrently with the rehashing. */ /* @@ -387,9 +395,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* increase its refcnt because it is referenced by the svc table */ atomic_inc(&svc->refcnt); + /* We know if new table is attached under service_mutex but rely on + * RCU to hold the old table to be freed in resizer + */ + rcu_read_lock(); + + /* This can be the old or the new table */ + t = rcu_dereference(ipvs->svc_table); + /* New entries go into recent table */ - t = rcu_dereference_protected(ipvs->svc_table, 1); - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); if (svc->fwmark == 0) { /* @@ -410,6 +425,8 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) hlist_bl_add_head_rcu(&svc->s_list, head); hlist_bl_unlock(head); + rcu_read_unlock(); + return 1; } @@ -432,7 +449,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) return 0; } - t = rcu_dereference_protected(ipvs->svc_table, 1); + /* We know if new table is attached under service_mutex but rely on + * RCU to hold the old table to be freed in resizer + */ + rcu_read_lock(); + + /* This can be the old or the new table */ + t = rcu_dereference(ipvs->svc_table); hash_key = READ_ONCE(svc->hash_key); /* We need to lock the bucket in the right table */ if (ip_vs_rht_same_table(t, hash_key)) { @@ -443,13 +466,13 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) /* Moved to new table ? */ if (hash_key != hash_key2) { hlist_bl_unlock(head); - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); head = t->buckets + (hash_key2 & t->mask); hlist_bl_lock(head); } } else { /* It is already moved to new table */ - t = rcu_dereference_protected(t->new_tbl, 1); + t = rcu_dereference(t->new_tbl); head = t->buckets + (hash_key & t->mask); hlist_bl_lock(head); } @@ -459,6 +482,8 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); hlist_bl_unlock(head); + + rcu_read_unlock(); return 1; } @@ -666,15 +691,14 @@ static void svc_resize_work_handler(struct work_struct *work) goto unlock_sem; more_work = false; clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags); - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + if (!READ_ONCE(ipvs->enable)) goto unlock_m; t = rcu_dereference_protected(ipvs->svc_table, 1); /* Do nothing if table is removed */ if (!t) goto unlock_m; - /* New table needs to be registered? BUG! */ - if (t != rcu_dereference_protected(t->new_tbl, 1)) + /* New table already attached? BUG! */ + if (t != rcu_access_pointer(t->new_tbl)) goto unlock_m; lfactor = sysctl_svc_lfactor(ipvs); @@ -691,6 +715,7 @@ static void svc_resize_work_handler(struct work_struct *work) /* Flip the table_id */ t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK; + /* Attach new table */ rcu_assign_pointer(t->new_tbl, t_new); /* Allow add/del to new_tbl while moving from old table */ mutex_unlock(&ipvs->service_mutex); @@ -698,8 +723,8 @@ static void svc_resize_work_handler(struct work_struct *work) ip_vs_rht_for_each_bucket(t, bucket, head) { same_bucket: if (++limit >= 16) { - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, + /* Check if work is stopped */ + if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) goto unlock_sem; if (resched_score >= 100) { @@ -764,16 +789,12 @@ same_bucket: goto same_bucket; } - /* Tables can be switched only under service_mutex */ - while (!mutex_trylock(&ipvs->service_mutex)) { - cond_resched(); - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) - goto unlock_sem; - } - if (!READ_ONCE(ipvs->enable) || - test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) - goto unlock_m; + /* Serialize with readers that don't like svc_table changes */ + down_write(&ipvs->svc_replace_sem); + + /* Check if work is stopped to avoid synchronize_rcu() */ + if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) + goto unlock_repl; rcu_assign_pointer(ipvs->svc_table, t_new); /* Inform readers that new table is installed */ @@ -781,8 +802,8 @@ same_bucket: atomic_inc(&ipvs->svc_table_changes); t_free = t; -unlock_m: - mutex_unlock(&ipvs->service_mutex); +unlock_repl: + up_write(&ipvs->svc_replace_sem); unlock_sem: up_write(&ipvs->svc_resize_sem); @@ -801,6 +822,11 @@ out: test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) return; queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); + return; + +unlock_m: + mutex_unlock(&ipvs->service_mutex); + goto unlock_sem; } static inline void @@ -1691,6 +1717,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_pe *pe = NULL; int ret_hooks = -1; int ret = 0; + bool grow; /* increase the module use count */ if (!ip_vs_use_count_inc()) @@ -1732,16 +1759,25 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, } #endif - t = rcu_dereference_protected(ipvs->svc_table, 1); + /* The old table can be freed, protect it with RCU */ + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); if (!t) { int lfactor = sysctl_svc_lfactor(ipvs); int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); + rcu_read_unlock(); t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); if (!t_new) { ret = -ENOMEM; goto out_err; } + grow = false; + } else { + /* Even the currently attached new table may need to grow */ + t = rcu_dereference(t->new_tbl); + grow = ip_vs_get_num_services(ipvs) + 1 > t->u_thresh; + rcu_read_unlock(); } if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { @@ -1800,6 +1836,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, goto out_err; if (t_new) { + /* Add table for first time */ clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); rcu_assign_pointer(ipvs->svc_table, t_new); t_new = NULL; @@ -1831,8 +1868,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_svc_hash(svc); /* Schedule resize work */ - if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && - !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) + if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1); @@ -2054,7 +2090,6 @@ static int ip_vs_del_service(struct ip_vs_service *svc) return -EEXIST; ipvs = svc->ipvs; ip_vs_unlink_service(svc, false); - t = rcu_dereference_protected(ipvs->svc_table, 1); /* Drop the table if no more services */ ns = ip_vs_get_num_services(ipvs); @@ -2062,6 +2097,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* Stop the resizer and drop the tables */ set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); cancel_delayed_work_sync(&ipvs->svc_resize_work); + t = rcu_dereference_protected(ipvs->svc_table, 1); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); /* Inform readers that table is removed */ @@ -2075,11 +2111,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc) t = p; } } - } else if (ns <= t->l_thresh && - !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, - &ipvs->work_flags)) { - queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, - 1); + } else { + bool shrink; + + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); + /* Even the currently attached new table may need to shrink */ + t = rcu_dereference(t->new_tbl); + shrink = ns <= t->l_thresh; + rcu_read_unlock(); + if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, + &ipvs->work_flags)) + queue_delayed_work(system_unbound_wq, + &ipvs->svc_resize_work, 1); } return 0; } @@ -2184,17 +2228,21 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, struct ip_vs_service *svc; struct hlist_bl_node *e; struct ip_vs_dest *dest; - int old_gen, new_gen; + int old_gen; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + /* Allow concurrent rehashing on resize but to avoid loop + * serialize with installing the new table. + */ + down_read(&ipvs->svc_replace_sem); + old_gen = atomic_read(&ipvs->svc_table_changes); rcu_read_lock(); -repeat: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { @@ -2207,17 +2255,17 @@ repeat: } resched_score++; if (resched_score >= 100) { - resched_score = 0; cond_resched_rcu(); - new_gen = atomic_read(&ipvs->svc_table_changes); - /* New table installed ? */ - if (old_gen != new_gen) { - old_gen = new_gen; - goto repeat; - } + /* Flushed? So no more dev refs */ + if (atomic_read(&ipvs->svc_table_changes) != old_gen) + goto done; + resched_score = 0; } } + +done: rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); return NOTIFY_DONE; } @@ -2244,6 +2292,10 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) struct ip_vs_service *svc; struct hlist_bl_node *e; + /* svc_table can not be replaced (svc_replace_sem) or + * removed (service_mutex) + */ + down_read(&ipvs->svc_replace_sem); rcu_read_lock(); ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { @@ -2259,6 +2311,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) } rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; @@ -3062,6 +3115,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) u32 sum; int i; + /* Info for conns */ rcu_read_lock(); t = rcu_dereference(ipvs->conn_tab); @@ -3123,6 +3177,12 @@ repeat_conn: } after_conns: + rcu_read_unlock(); + + /* Info for services */ + down_read(&ipvs->svc_replace_sem); + rcu_read_lock(); + t = rcu_dereference(ipvs->svc_table); count = ip_vs_get_num_services(ipvs); @@ -3133,9 +3193,7 @@ after_conns: if (!count) goto after_svc; old_gen = atomic_read(&ipvs->svc_table_changes); - loops = 0; -repeat_svc: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ memset(counts, 0, sizeof(counts)); ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { @@ -3157,15 +3215,10 @@ repeat_svc: if (resched_score >= 100) { resched_score = 0; cond_resched_rcu(); - new_gen = atomic_read(&ipvs->svc_table_changes); - /* New table installed ? */ - if (old_gen != new_gen) { - /* Too many changes? */ - if (++loops >= 5) - goto after_svc; - old_gen = new_gen; - goto repeat_svc; - } + /* Flushed? */ + if (atomic_read(&ipvs->svc_table_changes) != + old_gen) + goto after_svc; } counts[count]++; } @@ -3184,6 +3237,9 @@ repeat_svc: } after_svc: + rcu_read_unlock(); + up_read(&ipvs->svc_replace_sem); + seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", ipvs->est_kt_count, ipvs->est_max_threads); seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); @@ -3191,7 +3247,6 @@ after_svc: ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * IPVS_EST_NTICKS); - rcu_read_unlock(); return 0; } @@ -3503,7 +3558,7 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs, int ret = 0; lockdep_assert_held(&ipvs->svc_resize_sem); - /* All service modifications are disabled, go ahead */ + /* All svc_table modifications are disabled, go ahead */ ip_vs_rht_walk_buckets(ipvs->svc_table, head) { hlist_bl_for_each_entry(svc, e, head, s_list) { /* Only expose IPv4 entries to old interface */ @@ -3687,7 +3742,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) pr_err("length: %u != %zu\n", *len, size); return -EINVAL; } - /* Protect against table resizer moving the entries. + /* Prevent modifications to the list with services. * Try reverse locking, so that we do not hold the mutex * while waiting for semaphore. */ @@ -4029,6 +4084,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int start = cb->args[0]; int idx = 0; + /* Make sure we do not see same service twice during resize */ down_read(&ipvs->svc_resize_sem); rcu_read_lock(); ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) { @@ -5072,6 +5128,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); init_rwsem(&ipvs->svc_resize_sem); + init_rwsem(&ipvs->svc_replace_sem); INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler); atomic_set(&ipvs->svc_table_changes, 0); RCU_INIT_POINTER(ipvs->svc_table, NULL); -- cgit v1.2.3 From b2870fc21601db9133bc70c48c603b487614fa3b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 14 May 2026 16:46:38 +0200 Subject: netfilter: br_netfilter: Reallocate headroom if necessary in neigh_hh_bridge() neigh_hh_bridge() assumes the skb always has sufficient headroom to copy the aligned L2 header. This assumption can trigger the crash reported below using the following netfilter setup: $modprobe br_netfilter $sysctl -w net.bridge.bridge-nf-call-iptables=1 $root@OpenWrt:~# nft list ruleset table ip nat { chain prerouting { type nat hook prerouting priority dstnat; policy accept; ip daddr 192.168.83.123 dnat to 192.168.83.120 } } - iperf3 client (192.168.83.119) --> bridge (192.168.83.118) --> iperf3 server (192.168.83.120) the iperf3 client is sending packet for 192.168.83.123 to the bridge device. [ 1579.036575] Unable to handle kernel write to read-only memory at virtual address ffffff8004d76ffe [ 1579.045482] Mem abort info: [ 1579.048273] ESR = 0x000000009600004f [ 1579.052024] EC = 0x25: DABT (current EL), IL = 32 bits [ 1579.057363] SET = 0, FnV = 0 [ 1579.060417] EA = 0, S1PTW = 0 [ 1579.063550] FSC = 0x0f: level 3 permission fault [ 1579.068345] Data abort info: [ 1579.071224] ISV = 0, ISS = 0x0000004f, ISS2 = 0x00000000 [ 1579.076720] CM = 0, WnR = 1, TnD = 0, TagAccess = 0 [ 1579.081770] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 1579.087092] swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000080dc4000 [ 1579.093794] [ffffff8004d76ffe] pgd=180000009ffff003, p4d=180000009ffff003, pud=180000009ffff003, pmd=180000009ffe3003, pte=0060000084d76787 [ 1579.106343] Internal error: Oops: 000000009600004f [#1] SMP [ 1579.193824] CPU: 0 UID: 0 PID: 235 Comm: napi/qdma_eth-3 Tainted: G O 6.12.57 #0 [ 1579.202614] Tainted: [O]=OOT_MODULE [ 1579.206102] Hardware name: Airoha AN7581 Evaluation Board (DT) [ 1579.211929] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 1579.218889] pc : br_nf_pre_routing_finish_bridge+0x1ac/0xcc8 [br_netfilter] [ 1579.225859] lr : br_nf_pre_routing_finish_bridge+0x18c/0xcc8 [br_netfilter] [ 1579.232822] sp : ffffffc0817cba20 [ 1579.236128] x29: ffffffc0817cba20 x28: 0000000000000000 x27: ffffff8002b89000 [ 1579.243273] x26: ffffff8004d7700e x25: 0000000000000008 x24: 0000000000000000 [ 1579.250416] x23: ffffffc08179d4c0 x22: 0000000000000000 x21: ffffffc08179d4c0 [ 1579.257561] x20: ffffff8004d9b800 x19: ffffff8015010000 x18: 0000000000000014 [ 1579.264704] x17: ffffffbf9e930000 x16: ffffffc0817c8000 x15: 0000000000000070 [ 1579.271848] x14: 0000000000000080 x13: 0000000000000001 x12: 0000000000000000 [ 1579.278993] x11: ffffffc0798caae0 x10: ffffff8014db6fd8 x9 : 0000000000000000 [ 1579.286136] x8 : 0000000000000003 x7 : ffffffc08171f628 x6 : 000000001a3b83d3 [ 1579.293281] x5 : 0000000000000000 x4 : 1beb76f22fee0000 x3 : ffffff8004d7700e [ 1579.300425] x2 : 0000000000000000 x1 : ffffff8004d9b8bc x0 : ffffff80026ed000 [ 1579.307570] Call trace: [ 1579.310018] br_nf_pre_routing_finish_bridge+0x1ac/0xcc8 [br_netfilter] [ 1579.316632] br_nf_hook_thresh+0xd4/0x14bc [br_netfilter] [ 1579.322032] br_nf_hook_thresh+0x250/0x14bc [br_netfilter] [ 1579.327517] br_nf_hook_thresh+0x76c/0x14bc [br_netfilter] [ 1579.333003] br_handle_frame+0x180/0x480 [ 1579.336935] __netif_receive_skb_core.constprop.0+0x540/0xf40 [ 1579.342682] __netif_receive_skb_one_core+0x28/0x50 [ 1579.347561] process_backlog+0x98/0x1e0 [ 1579.351398] __napi_poll+0x34/0x1c4 [ 1579.354887] net_rx_action+0x178/0x330 [ 1579.358638] handle_softirqs+0x108/0x2d4 [ 1579.362560] __do_softirq+0x10/0x18 [ 1579.366051] ____do_softirq+0xc/0x20 [ 1579.369627] call_on_irq_stack+0x30/0x4c [ 1579.373550] do_softirq_own_stack+0x18/0x20 [ 1579.377734] do_softirq+0x4c/0x60 [ 1579.381050] __local_bh_enable_ip+0x88/0x98 [ 1579.385234] napi_threaded_poll_loop+0x188/0x21c [ 1579.389853] napi_threaded_poll+0x70/0x80 [ 1579.393863] kthread+0xd8/0xdc [ 1579.396918] ret_from_fork+0x10/0x20 [ 1579.400499] Code: 88dffc22 3707ffc2 f9406663 f9406684 (f81f0064) [ 1579.406589] ---[ end trace 0000000000000000 ]--- [ 1579.411209] Kernel panic - not syncing: Oops: Fatal exception in interrupt [ 1579.418083] SMP: stopping secondary CPUs [ 1579.422012] Kernel Offset: disabled Fix the issue reallocating the skb headroom if necessary in neigh_hh_bridge routine. Fixes: e179e6322ac33 ("netfilter: bridge-netfilter: Fix MAC header handling with IP DNAT") Reviewed-by: Ido Schimmel Signed-off-by: Lorenzo Bianconi Signed-off-by: Pablo Neira Ayuso --- include/net/neighbour.h | 8 ++++++-- net/bridge/br_netfilter_hooks.c | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 2dfee6d4258a..8860cc2175fc 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -489,11 +489,15 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { - unsigned int seq, hh_alen; + unsigned int seq, hh_alen = HH_DATA_ALIGN(ETH_HLEN); + int err; + + err = skb_cow_head(skb, hh_alen); + if (err) + return err; do { seq = read_seqbegin(&hh->hh_lock); - hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 0ab1c94db4b9..0a394e5f4391 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -297,7 +297,11 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_ goto free_skb; } - neigh_hh_bridge(&neigh->hh, skb); + if (neigh_hh_bridge(&neigh->hh, skb)) { + neigh_release(neigh); + goto free_skb; + } + skb->dev = br_indev; ret = br_handle_frame_finish(net, sk, skb); -- cgit v1.2.3 From e196115ec330a18de415bdb9f5071aa9f08e53ce Mon Sep 17 00:00:00 2001 From: Haoze Xie Date: Fri, 15 May 2026 11:19:02 +0800 Subject: netfilter: nf_queue: hold bridge skb->dev while queued br_pass_frame_up() rewrites skb->dev from the ingress port to the bridge master before queueing bridge LOCAL_IN packets. NFQUEUE only holds references on state.in/out and bridge physdevs, so a queued bridge packet can retain a freed bridge master in skb->dev until reinjection. When the verdict is reinjected later, br_netif_receive_skb() re-enters the receive path with skb->dev still pointing at the freed bridge master, triggering a use-after-free. Store skb->dev in the queue entry, hold a reference on it for the queue lifetime, and use the saved device when dropping queued packets during NETDEV_DOWN handling. Fixes: ac2863445686 ("netfilter: bridge: add nf_afinfo to enable queuing to userspace") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Haoze Xie Signed-off-by: Ren Wei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 1 + net/netfilter/nf_queue.c | 4 +++- net/netfilter/nfnetlink_queue.c | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index d17035d14d96..3978c3174cdb 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -14,6 +14,7 @@ struct nf_queue_entry { struct list_head list; struct rhash_head hash_node; struct sk_buff *skb; + struct net_device *skb_dev; unsigned int id; unsigned int hook_index; /* index in hook_entries->hook[] */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index a6c81c04b3a5..57b450024a99 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -61,6 +61,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. */ + dev_put(entry->skb_dev); dev_put(state->in); dev_put(state->out); if (state->sk) @@ -102,6 +103,7 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) return false; + dev_hold(entry->skb_dev); dev_hold(state->in); dev_hold(state->out); @@ -202,11 +204,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, *entry = (struct nf_queue_entry) { .skb = skb, + .skb_dev = skb->dev, .state = *state, .hook_index = index, .size = sizeof(*entry) + route_key_size, }; - __nf_queue_entry_init_physdevs(entry); if (!nf_queue_entry_get_refs(entry)) { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 58304fd1f70f..984a0eb9e149 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1212,6 +1212,8 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) if (physinif == ifindex || physoutif == ifindex) return 1; #endif + if (entry->skb_dev && entry->skb_dev->ifindex == ifindex) + return 1; if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; -- cgit v1.2.3 From 3d562d35a044ae798cab421c65a116f8cedfa5d4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 17 May 2026 09:55:28 +0200 Subject: bpf: Check global subprog exception paths Global subprogs are verified independently and are not descended into when their callers are symbolically executed. This means a caller can hold references or locks across a global subprog call that may throw, while the verifier only checks the non-exceptional return path at the call site. Record whether a subprog might throw in the CFG summary pass, alongside the existing might_sleep and packet-data-changing summaries, and propagate that effect through reachable callees. When a global subprog is marked as possibly throwing, push the normal continuation and validate the exceptional path immediately at the call site, avoiding a synthetic exception state and associated special case in the pruning checks. Fixes: f18b03fabaa9 ("bpf: Implement BPF exceptions") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260517075530.3461166-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/cfg.c | 13 ++++++++++++- kernel/bpf/verifier.c | 23 +++++++++++++++++------ 3 files changed, 31 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b148f816f25b..185b2aa43a42 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -729,6 +729,7 @@ struct bpf_subprog_info { */ s16 fastcall_stack_off; bool has_tail_call: 1; + bool might_throw: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; bool is_cb: 1; @@ -1308,6 +1309,7 @@ void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog); int bpf_find_subprog(struct bpf_verifier_env *env, int off); +bool bpf_is_throw_kfunc(struct bpf_insn *insn); int bpf_compute_const_regs(struct bpf_verifier_env *env); int bpf_prune_dead_branches(struct bpf_verifier_env *env); int bpf_check_cfg(struct bpf_verifier_env *env); diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c index 998f42a8189a..26d37066465f 100644 --- a/kernel/bpf/cfg.c +++ b/kernel/bpf/cfg.c @@ -64,11 +64,19 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) subprog->might_sleep = true; } +static void mark_subprog_might_throw(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = bpf_find_containing_subprog(env, off); + subprog->might_throw = true; +} + /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. * Rely on DFS traversal order and absence of recursive calls to guarantee that - * callee's change_pkt_data marks would be correct at that moment. + * callee's effect marks would be correct at that moment. */ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) { @@ -78,6 +86,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) callee = bpf_find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; caller->might_sleep |= callee->might_sleep; + caller->might_throw |= callee->might_throw; } enum { @@ -509,6 +518,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_subprog_might_sleep(env, t); if (ret == 0 && bpf_is_kfunc_pkt_changing(&meta)) mark_subprog_changes_pkt_data(env, t); + if (ret == 0 && bpf_is_throw_kfunc(insn)) + mark_subprog_might_throw(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 88b40c979b56..7fb88e1cd7c4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -442,7 +442,6 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); -static bool is_bpf_throw_kfunc(struct bpf_insn *insn); static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); static bool is_task_work_add_kfunc(u32 func_id); @@ -5405,7 +5404,7 @@ continue_func: if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) { bool err = false; - if (!is_bpf_throw_kfunc(insn + i)) + if (!bpf_is_throw_kfunc(insn + i)) continue; for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) { if (subprog[tmp].is_cb) { @@ -9499,6 +9498,9 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins return 0; } +static int process_bpf_exit_full(struct bpf_verifier_env *env, + bool *do_print_state, bool exception_exit); + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -9552,6 +9554,17 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; } + if (env->subprog_info[subprog].might_throw) { + struct bpf_verifier_state *branch; + + branch = push_stack(env, *insn_idx + 1, *insn_idx, false); + if (IS_ERR(branch)) { + verbose(env, "failed to push state for global subprog exception path\n"); + return PTR_ERR(branch); + } + return process_bpf_exit_full(env, NULL, true); + } + /* continue with next insn after call */ return 0; } @@ -11782,7 +11795,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id) is_task_work_add_kfunc(btf_id); } -static bool is_bpf_throw_kfunc(struct bpf_insn *insn) +bool bpf_is_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; @@ -12972,8 +12985,6 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); -static int process_bpf_exit_full(struct bpf_verifier_env *env, - bool *do_print_state, bool exception_exit); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) @@ -13354,7 +13365,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) env->prog->call_session_cookie = true; - if (is_bpf_throw_kfunc(insn)) + if (bpf_is_throw_kfunc(insn)) return process_bpf_exit_full(env, NULL, true); return 0; -- cgit v1.2.3 From f233124fb36cd57ef09f96d517a38ab4b902e15e Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 May 2026 09:39:01 +0200 Subject: ata: libata-scsi: do not use the deferred QC feature on PMPs with CBS When using Port Multipliers (PMPs) with Command-Based Switching (CBS), you can only issue commands to one link at a time. For PMPs with CBS, there is already code to handle commands being sent to different links in sata_pmp_qc_defer_cmd_switch() using ap->excl_link. sata_sil24 also makes use of ap->excl_link. A user on the list reported that commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") broke PMPs with CBS. The commit introduced code that stores a deferred qc in ap->deferred_qc, to later be issued via a workqueue. It turns out that this change is incompatible with the existing ap->excl_link handling used by PMPs with CBS. Thus, modify sata_pmp_qc_defer_cmd_switch() and sil24_qc_defer() to return ATA_DEFER_LINK_EXCL, and make sure that the deferred QC handling via workqueue is not used for this return value. This way, PMPs with CBS will work once again. Note that the starvation referenced in commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") can only happen on libsas ports, and libsas does not support Port Multipliers, thus there is no harm of reverting back to the previous way of deferring commands for PMPs with CBS. Non-libsas ports connected to anything but a PMP with CBS (e.g. a normal drive or a PMP with FBS) will continue using the deferred workqueue, since it does result in lower completion latencies for non-NCQ commands, even though the workqueue is not strictly needed to avoid starvation for non-libsas ports. If we want to modify the scope of the workqueue issuing to also handle PMPs with CBS, then we should ensure that we can save both NCQ and non-NCQ commands in ap->deferred_qc, while also removing the existing PMP CBS handling using ap->excl_link, such that we don't duplicate features. While at it, also add a comment explaining how the ap->excl_link mechanism works. Fixes: 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") Tested-by: Tommy Kelly Reported-by: Tommy Kelly Closes: https://lore.kernel.org/linux-ide/ce09cc21-a8e9-4845-b205-35411e22fba9@tkel.ly/ Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-pmp.c | 13 ++++++++++++- drivers/ata/libata-scsi.c | 8 ++++++++ drivers/ata/sata_sil24.c | 6 +++++- include/linux/libata.h | 1 + 4 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c index e3adc008fed1..7e889534d73b 100644 --- a/drivers/ata/libata-pmp.c +++ b/drivers/ata/libata-pmp.c @@ -110,13 +110,24 @@ int sata_pmp_qc_defer_cmd_switch(struct ata_queued_cmd *qc) { struct ata_link *link = qc->dev->link; struct ata_port *ap = link->ap; + int ret; if (ap->excl_link == NULL || ap->excl_link == link) { if (ap->nr_active_links == 0 || ata_link_active(link)) { qc->flags |= ATA_QCFLAG_CLEAR_EXCL; - return ata_std_qc_defer(qc); + ret = ata_std_qc_defer(qc); + if (ret == ATA_DEFER_LINK) + return ATA_DEFER_LINK_EXCL; + return ret; } + /* + * Note: ap->excl_link contains the link that is next in line, + * i.e. implicit round robin. If there is only one link + * dispatching, ap->excl_link will be left unclaimed, allowing + * other links to set ap->excl_link, ensuring that the currently + * active link cannot queue any more. + */ ap->excl_link = link; } diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index f03b6326ad2d..ca29744c57f9 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1787,6 +1787,14 @@ static int ata_scsi_qc_issue(struct ata_port *ap, struct ata_queued_cmd *qc) case ATA_DEFER_LINK: ret = SCSI_MLQUEUE_DEVICE_BUSY; goto defer_qc; + case ATA_DEFER_LINK_EXCL: + /* + * Drivers making use of ap->excl_link cannot store the QC in + * ap->deferred_qc, because the ap->excl_link handling is + * incompatible with the ap->deferred_qc workqueue handling. + */ + ret = SCSI_MLQUEUE_DEVICE_BUSY; + goto free_qc; case ATA_DEFER_PORT: ret = SCSI_MLQUEUE_HOST_BUSY; goto free_qc; diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index d642ece9f07a..57f1081b86db 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -789,6 +789,7 @@ static int sil24_qc_defer(struct ata_queued_cmd *qc) struct ata_link *link = qc->dev->link; struct ata_port *ap = link->ap; u8 prot = qc->tf.protocol; + int ret; /* * There is a bug in the chip: @@ -826,7 +827,10 @@ static int sil24_qc_defer(struct ata_queued_cmd *qc) qc->flags |= ATA_QCFLAG_CLEAR_EXCL; } - return ata_std_qc_defer(qc); + ret = ata_std_qc_defer(qc); + if (ret == ATA_DEFER_LINK) + return ATA_DEFER_LINK_EXCL; + return ret; } static enum ata_completion_errors sil24_qc_prep(struct ata_queued_cmd *qc) diff --git a/include/linux/libata.h b/include/linux/libata.h index 5c085ef4eda7..360776016b50 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -371,6 +371,7 @@ enum { /* return values for ->qc_defer */ ATA_DEFER_LINK = 1, ATA_DEFER_PORT = 2, + ATA_DEFER_LINK_EXCL = 3, /* desc_len for ata_eh_info and context */ ATA_EH_DESC_LEN = 80, -- cgit v1.2.3 From 759e8756da00aa115d504a18155b1d1ee1cc12e8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 May 2026 09:39:02 +0200 Subject: ata: libata-scsi: do not needlessly defer commands when using PMP with FBS The ACS specification does not allow a non-NCQ command to be issued while an NCQ command is outstanding. Commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") introduced a feature where a deferred non-NCQ command gets issued from a workqueue. The design stores a single non-NCQ command per port. However, when using Port Multipliers (PMPs), specifically PMPs that support FIS-Based Switching (FBS), non-NCQ and NCQ commands can be mixed on the same port, just not for the same link, see e.g. ata_std_qc_defer() which is, and always has operated on a per-link basis. Therefore, move the deferred_qc from struct ata_port to struct ata_link. This way, when using a PMP with FBS, we will not needlessly defer commands to all other links, just because one link issued a non-NCQ command while having an NCQ command outstanding. Only commands for that specific link will be deferred. This is in line with how PMPs with FBS worked before commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation"). Fixes: 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") Tested-by: Tommy Kelly Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 9 ++++++--- drivers/ata/libata-eh.c | 8 ++++---- drivers/ata/libata-pmp.c | 5 ++++- drivers/ata/libata-scsi.c | 43 +++++++++++++++++++++++++------------------ include/linux/libata.h | 6 +++--- 5 files changed, 42 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index e76d15411e2a..3d0027ec33c2 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5584,6 +5584,7 @@ void ata_link_init(struct ata_port *ap, struct ata_link *link, int pmp) link->pmp = pmp; link->active_tag = ATA_TAG_POISON; link->hw_sata_spd_limit = UINT_MAX; + INIT_WORK(&link->deferred_qc_work, ata_scsi_deferred_qc_work); /* can't use iterator, ap isn't initialized yet */ for (i = 0; i < ATA_MAX_DEVICES; i++) { @@ -5666,7 +5667,6 @@ struct ata_port *ata_port_alloc(struct ata_host *host) mutex_init(&ap->scsi_scan_mutex); INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug); INIT_DELAYED_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan); - INIT_WORK(&ap->deferred_qc_work, ata_scsi_deferred_qc_work); INIT_LIST_HEAD(&ap->eh_done_q); init_waitqueue_head(&ap->eh_wait_q); init_completion(&ap->park_req_pending); @@ -6291,12 +6291,15 @@ static void ata_port_detach(struct ata_port *ap) /* It better be dead now and not have any remaining deferred qc. */ WARN_ON(!(ap->pflags & ATA_PFLAG_UNLOADED)); - WARN_ON(ap->deferred_qc); - cancel_work_sync(&ap->deferred_qc_work); cancel_delayed_work_sync(&ap->hotplug_task); cancel_delayed_work_sync(&ap->scsi_rescan_task); + ata_for_each_link(link, ap, PMP_FIRST) { + WARN_ON(link->deferred_qc); + cancel_work_sync(&link->deferred_qc_work); + } + /* Delete port multiplier link transport devices */ if (ap->pmp_link) { int i; diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 9a4b67b90b17..d623eb32ed8b 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -651,11 +651,11 @@ int ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, if (qc->scsicmd != scmd) continue; if ((qc->flags & ATA_QCFLAG_ACTIVE) || - qc == ap->deferred_qc) + qc == qc->dev->link->deferred_qc) break; } - if (i < ATA_MAX_QUEUE && qc == ap->deferred_qc) { + if (i < ATA_MAX_QUEUE && qc == qc->dev->link->deferred_qc) { /* * This is a deferred command that timed out while * waiting for the command queue to drain. Since the qc @@ -666,8 +666,8 @@ int ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, * deferred qc work from issuing this qc. */ WARN_ON_ONCE(qc->flags & ATA_QCFLAG_ACTIVE); - ap->deferred_qc = NULL; - cancel_work(&ap->deferred_qc_work); + qc->dev->link->deferred_qc = NULL; + cancel_work(&qc->dev->link->deferred_qc_work); set_host_byte(scmd, DID_TIME_OUT); scsi_eh_finish_cmd(scmd, &ap->eh_done_q); } else if (i < ATA_MAX_QUEUE) { diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c index 7e889534d73b..e8540931b4a1 100644 --- a/drivers/ata/libata-pmp.c +++ b/drivers/ata/libata-pmp.c @@ -582,8 +582,11 @@ static void sata_pmp_detach(struct ata_device *dev) if (ap->ops->pmp_detach) ap->ops->pmp_detach(ap); - ata_for_each_link(tlink, ap, EDGE) + ata_for_each_link(tlink, ap, EDGE) { + WARN_ON(tlink->deferred_qc); + cancel_work_sync(&tlink->deferred_qc_work); ata_eh_detach_dev(tlink->device); + } spin_lock_irqsave(ap->lock, flags); ap->nr_pmp_links = 0; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index ca29744c57f9..d43207c6e467 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1664,8 +1664,9 @@ static void ata_scsi_qc_done(struct ata_queued_cmd *qc, bool set_result, void ata_scsi_deferred_qc_work(struct work_struct *work) { - struct ata_port *ap = - container_of(work, struct ata_port, deferred_qc_work); + struct ata_link *link = + container_of(work, struct ata_link, deferred_qc_work); + struct ata_port *ap = link->ap; struct ata_queued_cmd *qc; unsigned long flags; @@ -1676,10 +1677,10 @@ void ata_scsi_deferred_qc_work(struct work_struct *work) * such case, we should not need any more deferring the qc, so warn if * qc_defer() says otherwise. */ - qc = ap->deferred_qc; + qc = link->deferred_qc; if (qc && !ata_port_eh_scheduled(ap)) { WARN_ON_ONCE(ap->ops->qc_defer(qc)); - ap->deferred_qc = NULL; + link->deferred_qc = NULL; ata_qc_issue(qc); } @@ -1688,7 +1689,7 @@ void ata_scsi_deferred_qc_work(struct work_struct *work) void ata_scsi_requeue_deferred_qc(struct ata_port *ap) { - struct ata_queued_cmd *qc = ap->deferred_qc; + struct ata_link *link; lockdep_assert_held(ap->lock); @@ -1697,16 +1698,21 @@ void ata_scsi_requeue_deferred_qc(struct ata_port *ap) * do not try to be smart about what to do with this deferred command * and simply requeue it by completing it with DID_REQUEUE. */ - if (qc) { - ap->deferred_qc = NULL; - cancel_work(&ap->deferred_qc_work); - ata_scsi_qc_done(qc, true, DID_REQUEUE << 16); + ata_for_each_link(link, ap, PMP_FIRST) { + struct ata_queued_cmd *qc = link->deferred_qc; + + if (qc) { + link->deferred_qc = NULL; + cancel_work(&link->deferred_qc_work); + ata_scsi_qc_done(qc, true, DID_REQUEUE << 16); + } } } -static void ata_scsi_schedule_deferred_qc(struct ata_port *ap) +static void ata_scsi_schedule_deferred_qc(struct ata_link *link) { - struct ata_queued_cmd *qc = ap->deferred_qc; + struct ata_queued_cmd *qc = link->deferred_qc; + struct ata_port *ap = link->ap; lockdep_assert_held(ap->lock); @@ -1723,12 +1729,12 @@ static void ata_scsi_schedule_deferred_qc(struct ata_port *ap) return; } if (!ap->ops->qc_defer(qc)) - queue_work(system_highpri_wq, &ap->deferred_qc_work); + queue_work(system_highpri_wq, &link->deferred_qc_work); } static void ata_scsi_qc_complete(struct ata_queued_cmd *qc) { - struct ata_port *ap = qc->ap; + struct ata_link *link = qc->dev->link; struct scsi_cmnd *cmd = qc->scsicmd; u8 *cdb = cmd->cmnd; bool have_sense = qc->flags & ATA_QCFLAG_SENSE_VALID; @@ -1759,11 +1765,12 @@ static void ata_scsi_qc_complete(struct ata_queued_cmd *qc) ata_scsi_qc_done(qc, false, 0); - ata_scsi_schedule_deferred_qc(ap); + ata_scsi_schedule_deferred_qc(link); } static int ata_scsi_qc_issue(struct ata_port *ap, struct ata_queued_cmd *qc) { + struct ata_link *link = qc->dev->link; int ret; if (!ap->ops->qc_defer) @@ -1774,7 +1781,7 @@ static int ata_scsi_qc_issue(struct ata_port *ap, struct ata_queued_cmd *qc) * requeue and defer all incoming commands until the deferred qc is * processed, once all on-going commands complete. */ - if (ap->deferred_qc) { + if (link->deferred_qc) { ata_qc_free(qc); return SCSI_MLQUEUE_DEVICE_BUSY; } @@ -1790,8 +1797,8 @@ static int ata_scsi_qc_issue(struct ata_port *ap, struct ata_queued_cmd *qc) case ATA_DEFER_LINK_EXCL: /* * Drivers making use of ap->excl_link cannot store the QC in - * ap->deferred_qc, because the ap->excl_link handling is - * incompatible with the ap->deferred_qc workqueue handling. + * link->deferred_qc, because the ap->excl_link handling is + * incompatible with the link->deferred_qc workqueue handling. */ ret = SCSI_MLQUEUE_DEVICE_BUSY; goto free_qc; @@ -1817,7 +1824,7 @@ defer_qc: * commands complete. */ if (!ata_is_ncq(qc->tf.protocol)) { - ap->deferred_qc = qc; + link->deferred_qc = qc; return 0; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 360776016b50..127229fbd1a6 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -855,6 +855,9 @@ struct ata_link { unsigned int sata_spd; /* current SATA PHY speed */ enum ata_lpm_policy lpm_policy; + struct work_struct deferred_qc_work; + struct ata_queued_cmd *deferred_qc; + /* record runtime error info, protected by host_set lock */ struct ata_eh_info eh_info; /* EH context */ @@ -900,9 +903,6 @@ struct ata_port { u64 qc_active; int nr_active_links; /* #links with active qcs */ - struct work_struct deferred_qc_work; - struct ata_queued_cmd *deferred_qc; - struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ -- cgit v1.2.3 From 379e8f1ca5e919b130b40d8115d92a536e5f8d7a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 18 May 2026 13:41:45 +0200 Subject: drm/gem: Make the GEM LRU lock part of drm_device Recently, a few races have been discovered in the GEM LRU logic, all of them caused by the fact the LRU lock is accessed through gem->lru->lock, and that very same lock also protects changes to gem->lru, leading to situations where gem->lru needs to first be accessed without the lock held, to then get the lru to access the lock through and finally take the lock and do the expected operation. Currently, the only driver making use of this API (MSM) declares a device-wide lock, and the user we're about to add (panthor) will do the same. There's no evidence that we will ever have a driver that wants different pools of LRUs protected by different locks under the same drm_device. So we're better off moving this lock to drm_device and always locking it through obj->dev->gem_lru_mutex, or directly through dev->gem_lru_mutex. If anyone ever needs more fine-grained locking, this can be revisited to pass some drm_gem_lru_pool object representing the pool of LRUs under a specific lock, but for now, the per-device lock seems to be enough. Fixes: e7c2af13f811 ("drm/gem: Add LRU/shrinker helper") Reported-by: Chia-I Wu Closes: https://gitlab.freedesktop.org/panfrost/linux/-/work_items/86 Reviewed-by: Rob Clark Reviewed-by: Liviu Dudau Reviewed-by: Steven Price Reviewed-by: Chia-I Wu Link: https://patch.msgid.link/20260518-panthor-shrinker-fixes-v4-1-1920234470d5@collabora.com Signed-off-by: Boris Brezillon --- drivers/gpu/drm/drm_drv.c | 2 ++ drivers/gpu/drm/drm_gem.c | 36 +++++++++++++++------------------- drivers/gpu/drm/msm/msm_drv.c | 11 +++++------ drivers/gpu/drm/msm/msm_drv.h | 7 ------- drivers/gpu/drm/msm/msm_gem.c | 33 +++++++++++++++---------------- drivers/gpu/drm/msm/msm_gem_shrinker.c | 4 ++-- drivers/gpu/drm/msm/msm_gem_submit.c | 6 +++--- drivers/gpu/drm/msm/msm_gem_vma.c | 12 ++++++------ drivers/gpu/drm/msm/msm_ringbuffer.c | 6 +++--- include/drm/drm_device.h | 7 +++++++ include/drm/drm_gem.h | 20 +++++++++---------- 11 files changed, 69 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 985c283cf59f..675675480da4 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -697,6 +697,7 @@ static void drm_dev_init_release(struct drm_device *dev, void *res) mutex_destroy(&dev->master_mutex); mutex_destroy(&dev->clientlist_mutex); mutex_destroy(&dev->filelist_mutex); + mutex_destroy(&dev->gem_lru_mutex); } static int drm_dev_init(struct drm_device *dev, @@ -738,6 +739,7 @@ static int drm_dev_init(struct drm_device *dev, INIT_LIST_HEAD(&dev->vblank_event_list); spin_lock_init(&dev->event_lock); + mutex_init(&dev->gem_lru_mutex); mutex_init(&dev->filelist_mutex); mutex_init(&dev->clientlist_mutex); mutex_init(&dev->master_mutex); diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index d6424267260b..b95b015d2983 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1541,12 +1541,10 @@ EXPORT_SYMBOL(drm_gem_unlock_reservations); * drm_gem_lru_init - initialize a LRU * * @lru: The LRU to initialize - * @lock: The lock protecting the LRU */ void -drm_gem_lru_init(struct drm_gem_lru *lru, struct mutex *lock) +drm_gem_lru_init(struct drm_gem_lru *lru) { - lru->lock = lock; lru->count = 0; INIT_LIST_HEAD(&lru->list); } @@ -1571,14 +1569,10 @@ drm_gem_lru_remove_locked(struct drm_gem_object *obj) void drm_gem_lru_remove(struct drm_gem_object *obj) { - struct drm_gem_lru *lru = obj->lru; - - if (!lru) - return; - - mutex_lock(lru->lock); - drm_gem_lru_remove_locked(obj); - mutex_unlock(lru->lock); + mutex_lock(&obj->dev->gem_lru_mutex); + if (obj->lru) + drm_gem_lru_remove_locked(obj); + mutex_unlock(&obj->dev->gem_lru_mutex); } EXPORT_SYMBOL(drm_gem_lru_remove); @@ -1593,7 +1587,7 @@ EXPORT_SYMBOL(drm_gem_lru_remove); void drm_gem_lru_move_tail_locked(struct drm_gem_lru *lru, struct drm_gem_object *obj) { - lockdep_assert_held_once(lru->lock); + lockdep_assert_held_once(&obj->dev->gem_lru_mutex); if (obj->lru) drm_gem_lru_remove_locked(obj); @@ -1617,9 +1611,9 @@ EXPORT_SYMBOL(drm_gem_lru_move_tail_locked); void drm_gem_lru_move_tail(struct drm_gem_lru *lru, struct drm_gem_object *obj) { - mutex_lock(lru->lock); + mutex_lock(&obj->dev->gem_lru_mutex); drm_gem_lru_move_tail_locked(lru, obj); - mutex_unlock(lru->lock); + mutex_unlock(&obj->dev->gem_lru_mutex); } EXPORT_SYMBOL(drm_gem_lru_move_tail); @@ -1633,6 +1627,7 @@ EXPORT_SYMBOL(drm_gem_lru_move_tail); * of the shrink callback to check for this (ie. dma_resv_test_signaled()) * or if necessary block until the buffer becomes idle. * + * @dev: DRM device the LRU belongs to * @lru: The LRU to scan * @nr_to_scan: The number of pages to try to reclaim * @remaining: The number of pages left to reclaim, should be initialized by caller @@ -1640,7 +1635,8 @@ EXPORT_SYMBOL(drm_gem_lru_move_tail); * @ticket: Optional ww_acquire_ctx context to use for locking */ unsigned long -drm_gem_lru_scan(struct drm_gem_lru *lru, +drm_gem_lru_scan(struct drm_device *dev, + struct drm_gem_lru *lru, unsigned int nr_to_scan, unsigned long *remaining, bool (*shrink)(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket), @@ -1650,9 +1646,9 @@ drm_gem_lru_scan(struct drm_gem_lru *lru, struct drm_gem_object *obj; unsigned freed = 0; - drm_gem_lru_init(&still_in_lru, lru->lock); + drm_gem_lru_init(&still_in_lru); - mutex_lock(lru->lock); + mutex_lock(&dev->gem_lru_mutex); while (freed < nr_to_scan) { obj = list_first_entry_or_null(&lru->list, typeof(*obj), lru_node); @@ -1675,7 +1671,7 @@ drm_gem_lru_scan(struct drm_gem_lru *lru, * rest of the loop body, to reduce contention with other * code paths that need the LRU lock */ - mutex_unlock(lru->lock); + mutex_unlock(&dev->gem_lru_mutex); if (ticket) ww_acquire_init(ticket, &reservation_ww_class); @@ -1709,7 +1705,7 @@ drm_gem_lru_scan(struct drm_gem_lru *lru, tail: drm_gem_object_put(obj); - mutex_lock(lru->lock); + mutex_lock(&dev->gem_lru_mutex); } /* @@ -1721,7 +1717,7 @@ tail: list_splice_tail(&still_in_lru.list, &lru->list); lru->count += still_in_lru.count; - mutex_unlock(lru->lock); + mutex_unlock(&dev->gem_lru_mutex); return freed; } diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 195f40e331e5..cc2bcd14b1c2 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -128,11 +128,10 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv, /* * Initialize the LRUs: */ - mutex_init(&priv->lru.lock); - drm_gem_lru_init(&priv->lru.unbacked, &priv->lru.lock); - drm_gem_lru_init(&priv->lru.pinned, &priv->lru.lock); - drm_gem_lru_init(&priv->lru.willneed, &priv->lru.lock); - drm_gem_lru_init(&priv->lru.dontneed, &priv->lru.lock); + drm_gem_lru_init(&priv->lru.unbacked); + drm_gem_lru_init(&priv->lru.pinned); + drm_gem_lru_init(&priv->lru.willneed); + drm_gem_lru_init(&priv->lru.dontneed); /* Initialize stall-on-fault */ spin_lock_init(&priv->fault_stall_lock); @@ -140,7 +139,7 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv, /* Teach lockdep about lock ordering wrt. shrinker: */ fs_reclaim_acquire(GFP_KERNEL); - might_lock(&priv->lru.lock); + might_lock(&ddev->gem_lru_mutex); fs_reclaim_release(GFP_KERNEL); if (priv->kms_init) { diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 6d847d593f1a..617b3c4b42c0 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -150,13 +150,6 @@ struct msm_drm_private { * DONTNEED state (ie. can be purged) */ struct drm_gem_lru dontneed; - - /** - * lock: - * - * Protects manipulation of all of the LRUs. - */ - struct mutex lock; } lru; struct notifier_block vmap_notifier; diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 2cb3ab04f125..efd3d3c9a449 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -177,11 +177,11 @@ static void update_lru_locked(struct drm_gem_object *obj) static void update_lru(struct drm_gem_object *obj) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); update_lru_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); } static struct page **get_pages(struct drm_gem_object *obj) @@ -292,11 +292,11 @@ void msm_gem_pin_obj_locked(struct drm_gem_object *obj) static void pin_obj_locked(struct drm_gem_object *obj) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); msm_gem_pin_obj_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); } struct page **msm_gem_pin_pages_locked(struct drm_gem_object *obj) @@ -487,16 +487,16 @@ int msm_gem_pin_vma_locked(struct drm_gem_object *obj, struct drm_gpuva *vma) void msm_gem_unpin_locked(struct drm_gem_object *obj) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; struct msm_gem_object *msm_obj = to_msm_bo(obj); msm_gem_assert_locked(obj); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); msm_obj->pin_count--; GEM_WARN_ON(msm_obj->pin_count < 0); update_lru_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); } /* Special unpin path for use in fence-signaling path, avoiding the need @@ -507,10 +507,10 @@ void msm_gem_unpin_locked(struct drm_gem_object *obj) */ void msm_gem_unpin_active(struct drm_gem_object *obj) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; struct msm_gem_object *msm_obj = to_msm_bo(obj); - GEM_WARN_ON(!mutex_is_locked(&priv->lru.lock)); + GEM_WARN_ON(!mutex_is_locked(&dev->gem_lru_mutex)); msm_obj->pin_count--; GEM_WARN_ON(msm_obj->pin_count < 0); @@ -797,12 +797,12 @@ void msm_gem_put_vaddr(struct drm_gem_object *obj) */ int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv) { - struct msm_drm_private *priv = obj->dev->dev_private; + struct drm_device *dev = obj->dev; struct msm_gem_object *msm_obj = to_msm_bo(obj); msm_gem_lock(obj); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); if (msm_obj->madv != __MSM_MADV_PURGED) msm_obj->madv = madv; @@ -814,7 +814,7 @@ int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv) */ update_lru_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); msm_gem_unlock(obj); @@ -824,7 +824,6 @@ int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv) void msm_gem_purge(struct drm_gem_object *obj) { struct drm_device *dev = obj->dev; - struct msm_drm_private *priv = obj->dev->dev_private; struct msm_gem_object *msm_obj = to_msm_bo(obj); msm_gem_assert_locked(obj); @@ -839,10 +838,10 @@ void msm_gem_purge(struct drm_gem_object *obj) put_pages(obj); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); /* A one-way transition: */ msm_obj->madv = __MSM_MADV_PURGED; - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); drm_gem_free_mmap_offset(obj); diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c index 31fa51a44f86..c07af9602fee 100644 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c @@ -186,7 +186,7 @@ msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) if (!stages[i].cond) continue; stages[i].freed = - drm_gem_lru_scan(stages[i].lru, nr, + drm_gem_lru_scan(priv->dev, stages[i].lru, nr, &stages[i].remaining, stages[i].shrink, &ticket); @@ -255,7 +255,7 @@ msm_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr) unsigned long remaining = 0; for (idx = 0; lrus[idx] && unmapped < vmap_shrink_limit; idx++) { - unmapped += drm_gem_lru_scan(lrus[idx], + unmapped += drm_gem_lru_scan(priv->dev, lrus[idx], vmap_shrink_limit - unmapped, &remaining, vmap_shrink, diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 26ea8a28be47..3c6bc90c3d48 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -352,7 +352,7 @@ static int submit_fence_sync(struct msm_gem_submit *submit) static int submit_pin_objects(struct msm_gem_submit *submit) { - struct msm_drm_private *priv = submit->dev->dev_private; + struct drm_device *dev = submit->dev; int i, ret = 0; for (i = 0; i < submit->nr_bos; i++) { @@ -381,11 +381,11 @@ static int submit_pin_objects(struct msm_gem_submit *submit) * get_pages() which could trigger reclaim.. and if we held the LRU lock * could trigger deadlock with the shrinker). */ - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); for (i = 0; i < submit->nr_bos; i++) { msm_gem_pin_obj_locked(submit->bos[i].obj); } - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); submit->bos_pinned = true; diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c index 1a952b171ed7..c4cfe036066b 100644 --- a/drivers/gpu/drm/msm/msm_gem_vma.c +++ b/drivers/gpu/drm/msm/msm_gem_vma.c @@ -702,7 +702,7 @@ static struct dma_fence * msm_vma_job_run(struct drm_sched_job *_job) { struct msm_vm_bind_job *job = to_msm_vm_bind_job(_job); - struct msm_drm_private *priv = job->vm->drm->dev_private; + struct drm_device *dev = job->vm->drm; struct msm_gem_vm *vm = to_msm_vm(job->vm); struct drm_gem_object *obj; int ret = vm->unusable ? -EINVAL : 0; @@ -745,13 +745,13 @@ msm_vma_job_run(struct drm_sched_job *_job) if (ret) msm_gem_vm_unusable(job->vm); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); job_foreach_bo (obj, job) { msm_gem_unpin_active(obj); } - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); /* VM_BIND ops are synchronous, so no fence to wait on: */ return NULL; @@ -1305,7 +1305,7 @@ vm_bind_job_pin_objects(struct msm_vm_bind_job *job) return PTR_ERR(pages); } - struct msm_drm_private *priv = job->vm->drm->dev_private; + struct drm_device *dev = job->vm->drm; /* * A second loop while holding the LRU lock (a) avoids acquiring/dropping @@ -1314,10 +1314,10 @@ vm_bind_job_pin_objects(struct msm_vm_bind_job *job) * get_pages() which could trigger reclaim.. and if we held the LRU lock * could trigger deadlock with the shrinker). */ - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); job_foreach_bo (obj, job) msm_gem_pin_obj_locked(obj); - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); job->bos_pinned = true; diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 30ddb5351e98..2d6b930b766e 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -16,13 +16,13 @@ static struct dma_fence *msm_job_run(struct drm_sched_job *job) struct msm_gem_submit *submit = to_msm_submit(job); struct msm_fence_context *fctx = submit->ring->fctx; struct msm_gpu *gpu = submit->gpu; - struct msm_drm_private *priv = gpu->dev->dev_private; + struct drm_device *dev = gpu->dev; unsigned nr_cmds = submit->nr_cmds; int i; msm_fence_init(submit->hw_fence, fctx); - mutex_lock(&priv->lru.lock); + mutex_lock(&dev->gem_lru_mutex); for (i = 0; i < submit->nr_bos; i++) { struct drm_gem_object *obj = submit->bos[i].obj; @@ -32,7 +32,7 @@ static struct dma_fence *msm_job_run(struct drm_sched_job *job) submit->bos_pinned = false; - mutex_unlock(&priv->lru.lock); + mutex_unlock(&dev->gem_lru_mutex); /* TODO move submit path over to using a per-ring lock.. */ mutex_lock(&gpu->lock); diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h index bc78fb77cc27..768a8dae83c5 100644 --- a/include/drm/drm_device.h +++ b/include/drm/drm_device.h @@ -375,6 +375,13 @@ struct drm_device { * Root directory for debugfs files. */ struct dentry *debugfs_root; + + /** + * @gem_lru_mutex: + * + * Lock protecting movement of GEM objects between LRUs. + */ + struct mutex gem_lru_mutex; }; void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev); diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index 86f5846154f7..8a704f6a65c1 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -245,17 +245,11 @@ struct drm_gem_object_funcs { * for lockless &shrinker.count_objects, and provides * &drm_gem_lru_scan for driver's &shrinker.scan_objects * implementation. + * + * Any access to this kind of object must be done with + * drm_device::gem_lru_mutex held. */ struct drm_gem_lru { - /** - * @lock: - * - * Lock protecting movement of GEM objects between LRUs. All - * LRUs that the object can move between should be protected - * by the same lock. - */ - struct mutex *lock; - /** * @count: * @@ -453,6 +447,9 @@ struct drm_gem_object { * @lru: * * The current LRU list that the GEM object is on. + * + * Access to this field must be done with drm_device::gem_lru_mutex + * held. */ struct drm_gem_lru *lru; }; @@ -610,12 +607,13 @@ void drm_gem_unlock_reservations(struct drm_gem_object **objs, int count, int drm_gem_dumb_map_offset(struct drm_file *file, struct drm_device *dev, u32 handle, u64 *offset); -void drm_gem_lru_init(struct drm_gem_lru *lru, struct mutex *lock); +void drm_gem_lru_init(struct drm_gem_lru *lru); void drm_gem_lru_remove(struct drm_gem_object *obj); void drm_gem_lru_move_tail_locked(struct drm_gem_lru *lru, struct drm_gem_object *obj); void drm_gem_lru_move_tail(struct drm_gem_lru *lru, struct drm_gem_object *obj); unsigned long -drm_gem_lru_scan(struct drm_gem_lru *lru, +drm_gem_lru_scan(struct drm_device *dev, + struct drm_gem_lru *lru, unsigned int nr_to_scan, unsigned long *remaining, bool (*shrink)(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket), -- cgit v1.2.3 From 8817005efbdfdf5d4e4814cb5dc52b53d12917d7 Mon Sep 17 00:00:00 2001 From: Qing Ming Date: Sat, 16 May 2026 15:08:49 +0800 Subject: cgroup/rstat: validate cpu before css_rstat_cpu() access css_rstat_updated() is exposed as a BPF kfunc and accepts a caller-provided cpu argument. The function uses cpu for per-cpu rstat lookups without checking whether it refers to a valid possible CPU. A BPF iter/cgroup program with CAP_BPF and CAP_PERFMON can pass an invalid cpu value. On an unfixed UBSCAN_BOUNDS test kernel, cpu == 0x7fffffff triggers: UBSAN: array-index-out-of-bounds in kernel/cgroup/rstat.c:31:9 index 2147483647 is out of range for type 'long unsigned int [64]' Call Trace: css_rstat_updated bpf_iter_run_prog cgroup_iter_seq_show bpf_seq_read Add cpu validation to the BPF-facing css_rstat_updated() kfunc and move the common implementation to __css_rstat_updated() for in-kernel callers. Fixes: a319185be9f5 ("cgroup: bpf: enable bpf programs to integrate with rstat") Signed-off-by: Qing Ming Signed-off-by: Tejun Heo --- block/blk-cgroup.c | 2 +- include/linux/cgroup.h | 1 + kernel/cgroup/rstat.c | 30 ++++++++++++++++++++---------- mm/memcontrol.c | 6 +++--- 4 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 554c87bb4a86..bc63bd220865 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -2241,7 +2241,7 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - css_rstat_updated(&blkcg->css, cpu); + __css_rstat_updated(&blkcg->css, cpu); put_cpu(); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..e011dc43fcf1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -776,6 +776,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup scalable recursive statistics. */ +void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_flush(struct cgroup_subsys_state *css); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 150e5871e66f..ed60ba119c68 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" +#include #include #include @@ -53,7 +54,7 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) } /** - * css_rstat_updated - keep track of updated rstat_cpu + * __css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * @@ -63,20 +64,17 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) * * NOTE: if the user needs the guarantee that the updater either add itself in * the lockless list or the concurrent flusher flushes its updated stats, a - * memory barrier is needed before the call to css_rstat_updated() i.e. a + * memory barrier is needed before the call to __css_rstat_updated() i.e. a * barrier after updating the per-cpu stats and before calling - * css_rstat_updated(). + * __css_rstat_updated(). */ -__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) +void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct llist_head *lhead; struct css_rstat_cpu *rstatc; struct llist_node *self; - /* - * Since bpf programs can call this function, prevent access to - * uninitialized rstat pointers. - */ + /* Prevent access to uninitialized rstat pointers. */ if (!css_uses_rstat(css)) return; @@ -125,6 +123,18 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) llist_add(&rstatc->lnode, lhead); } +/* + * BPF-facing wrapper for __css_rstat_updated(). Validate the caller-provided + * CPU before passing it to the internal rstat updater. + */ +__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) +{ + if (unlikely(cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu))) + return; + + __css_rstat_updated(css, cpu); +} + static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) { /* put @css and all ancestors on the corresponding updated lists */ @@ -170,7 +180,7 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) * flusher flush the stats updated by the updater who have * observed that they are already on the list. The * corresponding barrier pair for this one should be before - * css_rstat_updated() by the user. + * __css_rstat_updated() by the user. * * For now, there aren't any such user, so not adding the * barrier here but if such a use-case arise, please add @@ -614,7 +624,7 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); - css_rstat_updated(&cgrp->self, smp_processor_id()); + __css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 051b82ebf371..c7e60f26013c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -579,7 +579,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, if (!val) return; - css_rstat_updated(&memcg->css, cpu); + __css_rstat_updated(&memcg->css, cpu); statc_pcpu = memcg->vmstats_percpu; for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { statc = this_cpu_ptr(statc_pcpu); @@ -2608,7 +2608,7 @@ static inline void account_slab_nmi_safe(struct mem_cgroup *memcg, struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id]; /* preemption is disabled in_nmi(). */ - css_rstat_updated(&memcg->css, smp_processor_id()); + __css_rstat_updated(&memcg->css, smp_processor_id()); if (idx == NR_SLAB_RECLAIMABLE_B) atomic_add(nr, &pn->slab_reclaimable); else @@ -2832,7 +2832,7 @@ static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val) mod_memcg_state(memcg, MEMCG_KMEM, val); } else { /* preemption is disabled in_nmi(). */ - css_rstat_updated(&memcg->css, smp_processor_id()); + __css_rstat_updated(&memcg->css, smp_processor_id()); atomic_add(val, &memcg->kmem_stat); } } -- cgit v1.2.3 From 8939562b16052c75b908d3c5f968bffb526fc6e9 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Mon, 18 May 2026 15:02:08 +0800 Subject: efi: efi.h: Remove extra semicolon Remove extra semicolons from comments. Signed-off-by: Rong Tao Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 72e76ec54641..ccbc35479684 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -61,7 +61,7 @@ typedef void *efi_handle_t; /* * The UEFI spec and EDK2 reference implementation both define EFI_GUID as - * struct { u32 a; u16; b; u16 c; u8 d[8]; }; and so the implied alignment + * struct { u32 a; u16 b; u16 c; u8 d[8]; }; and so the implied alignment * is 32 bits not 8 bits like our guid_t. In some cases (i.e., on 32-bit ARM), * this means that firmware services invoked by the kernel may assume that * efi_guid_t* arguments are 32-bit aligned, and use memory accessors that -- cgit v1.2.3 From 7122ff96068a03595bde2fbafaca82ca2ed8084e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 13 May 2026 12:00:16 -0300 Subject: RDMA/core: Do not read wild stack memory in uverbs_get_handler_fn() Sashiko points out the legacy write path in ib_uverbs_write() does allocate a struct uverbs_attr_bundle, but it doesn't wrap it in a bundle_priv so downcasting here isn't safe. Instead lift the method_elm out of the bundle_priv and use it for the debug function. The legacy write path will leave it set as NULL since the write method_elm uses a different type. Cc: stable@vger.kernel.org Fixes: 1de9287ece44 ("RDMA: Add ib_copy_validate_udata_in()") Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/ib_core_uverbs.c | 4 +--- drivers/infiniband/core/uverbs.h | 1 - drivers/infiniband/core/uverbs_ioctl.c | 26 ++++++++++++++------------ include/rdma/uverbs_ioctl.h | 1 + 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c index 685030e0c60f..8a0e6fa2a528 100644 --- a/drivers/infiniband/core/ib_core_uverbs.c +++ b/drivers/infiniband/core/ib_core_uverbs.c @@ -422,12 +422,10 @@ uverbs_api_ioctl_handler_fn uverbs_get_handler_fn(struct ib_udata *udata) { struct uverbs_attr_bundle *bundle = rdma_udata_to_uverbs_attr_bundle(udata); - struct bundle_priv *pbundle = - container_of(&bundle->hdr, struct bundle_priv, bundle); lockdep_assert_held(&bundle->ufile->device->disassociate_srcu); - return srcu_dereference(pbundle->method_elm->handler, + return srcu_dereference(bundle->method_elm->handler, &bundle->ufile->device->disassociate_srcu); } diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index a74a2dff1301..f2e192b51e60 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -244,7 +244,6 @@ struct bundle_priv { size_t internal_used; struct radix_tree_root *radix; - const struct uverbs_api_ioctl_method *method_elm; void __rcu **radix_slots; unsigned long radix_slots_len; u32 method_key; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 33feb88d652b..2552a7efe2fb 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -397,13 +397,13 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, struct uverbs_attr_bundle *bundle = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs); - unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey; + unsigned int destroy_bkey = bundle->method_elm->destroy_bkey; unsigned int i; int ret; /* See uverbs_disassociate_api() */ handler = srcu_dereference( - pbundle->method_elm->handler, + bundle->method_elm->handler, &pbundle->bundle.ufile->device->disassociate_srcu); if (!handler) return -EIO; @@ -421,12 +421,12 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, } /* User space did not provide all the mandatory attributes */ - if (unlikely(!bitmap_subset(pbundle->method_elm->attr_mandatory, + if (unlikely(!bitmap_subset(bundle->method_elm->attr_mandatory, pbundle->bundle.attr_present, - pbundle->method_elm->key_bitmap_len))) + bundle->method_elm->key_bitmap_len))) return -EINVAL; - if (pbundle->method_elm->has_udata) + if (bundle->method_elm->has_udata) uverbs_fill_udata(bundle, &pbundle->bundle.driver_udata, UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT); else @@ -451,7 +451,7 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, * assume that the driver wrote to its UHW_OUT and flag userspace * appropriately. */ - if (!ret && pbundle->method_elm->has_udata) { + if (!ret && bundle->method_elm->has_udata) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT); @@ -472,7 +472,7 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, static void bundle_destroy(struct bundle_priv *pbundle, bool commit) { - unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len; + unsigned int key_bitmap_len = pbundle->bundle.method_elm->key_bitmap_len; struct uverbs_attr_bundle *bundle = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); struct bundle_alloc_head *memblock; @@ -560,7 +560,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, } /* Space for the pbundle->bundle.attrs flex array */ - pbundle->method_elm = method_elm; + pbundle->bundle.method_elm = method_elm; pbundle->method_key = attrs_iter.index; pbundle->bundle.ufile = ufile; pbundle->bundle.context = NULL; /* only valid if bundle has uobject */ @@ -569,10 +569,12 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter); pbundle->user_attrs = user_attrs; - pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len * - sizeof(*container_of(&pbundle->bundle, - struct uverbs_attr_bundle, hdr)->attrs), - sizeof(*pbundle->internal_buffer)); + pbundle->internal_used = ALIGN( + pbundle->bundle.method_elm->key_bitmap_len * + sizeof(*container_of(&pbundle->bundle, + struct uverbs_attr_bundle, hdr) + ->attrs), + sizeof(*pbundle->internal_buffer)); memset(pbundle->bundle.attr_present, 0, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index e2af17da3e32..c89428030d61 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -635,6 +635,7 @@ struct uverbs_attr_bundle { struct ib_uverbs_file *ufile; struct ib_ucontext *context; struct ib_uobject *uobject; + const struct uverbs_api_ioctl_method *method_elm; DECLARE_BITMAP(attr_present, UVERBS_API_ATTR_BKEY_LEN); ); struct uverbs_attr attrs[]; -- cgit v1.2.3 From 0cb5a74faa3bdcfa3b18735d554e12c0f615e35d Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Mon, 18 May 2026 15:44:57 +0200 Subject: net: airoha: Fix NPU RX DMA descriptor bits In an internal review from Airoha, it was notice that the RX DMA descriptor bits and mask are wrong. These values probably refer to an old NPU firmware never published. The previous value works correctly but it was reported that in some specific condition in mixed scenario with both Ethernet and WiFi offload it's possible that RX DMA descriptor signal wrong value with the problem to the RX ring or packets getting dropped. To handle these specific scenario, apply the new suggested bits mask from Airoha. Correct functionality of both AN7581 NPU and MT7996 variant were verified and confirmed working. Fixes: a7fc8c641cab ("net: airoha: Fix npu rx DMA definitions") Signed-off-by: Christian Marangi Acked-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260518134530.3683-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/soc/airoha/airoha_offload.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index d01ef4a6b3d7..7589fccfeef6 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -71,9 +71,9 @@ static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, #define NPU_RX1_DESC_NUM 512 /* CTRL */ -#define NPU_RX_DMA_DESC_LAST_MASK BIT(27) -#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(26, 14) -#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(13, 1) +#define NPU_RX_DMA_DESC_LAST_MASK BIT(29) +#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(28, 15) +#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(14, 1) #define NPU_RX_DMA_DESC_DONE_MASK BIT(0) /* INFO */ #define NPU_RX_DMA_PKT_COUNT_MASK GENMASK(31, 29) -- cgit v1.2.3 From b8d7519352ba8c6df83259295d4a3bad093cae90 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 15 May 2026 15:13:25 -0700 Subject: net: shaper: rework the VALID marking (again) Recent commit changed the semantics from NOT_VALID to VALID. I didn't realize that the flags are not stored atomically with the entry in XArray. There's still a race of reader observing a VALID mark for a slot, getting interrupted, writer replacing the entry with a different one, reader continuing, fetching the entry which is now a different pointer than the pointer for which VALID was meant. The biggest consequence of this is that we may see a UAF since net_shaper_rollback() assumed that entries without VALID can be freed without observing RCU. Looks like the XArray marks are buying us nothing at this point. Let's convert the code to an explicit valid field. The smp_load_acquire() / smp_store_release() barriers are marginally cleaner. Reported-by: Sashiko Fixes: 93954b40f6a4 ("net-shapers: implement NL set and delete operations") Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260515221325.1685455-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/net_shaper.h | 1 + net/shaper/shaper.c | 45 ++++++++++++++++++--------------------------- 2 files changed, 19 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h index 5c3f49b52fe9..3939b816b001 100644 --- a/include/net/net_shaper.h +++ b/include/net/net_shaper.h @@ -53,6 +53,7 @@ struct net_shaper { /* private: */ u32 leaves; /* accounted only for NODE scope */ + bool valid; struct rcu_head rcu; }; diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 520cefdc3d90..dea9270f3e57 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -306,31 +306,24 @@ static void net_shaper_default_parent(const struct net_shaper_handle *handle, parent->id = 0; } -/* MARK_0 is already in use due to XA_FLAGS_ALLOC. The VALID mark is set on - * an entry only after the device-side configuration has completed - * successfully (see net_shaper_commit()). Lookups and dumps must filter on - * this mark to avoid exposing tentative entries inserted by - * net_shaper_pre_insert() while the driver call is still in flight. - */ -#define NET_SHAPER_VALID XA_MARK_1 - static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, const struct net_shaper_handle *handle) { u32 index = net_shaper_handle_to_index(handle); struct net_shaper_hierarchy *hierarchy; + struct net_shaper *cur; hierarchy = net_shaper_hierarchy_rcu(binding); - if (!hierarchy || !xa_get_mark(&hierarchy->shapers, index, - NET_SHAPER_VALID)) + if (!hierarchy) return NULL; - /* Pairs with smp_wmb() in net_shaper_commit(): if the entry is - * valid, its contents must be visible too. - */ - smp_rmb(); - return xa_load(&hierarchy->shapers, index); + cur = xa_load(&hierarchy->shapers, index); + /* Check valid before reading fields */ + if (!cur || !smp_load_acquire(&cur->valid)) + return NULL; + + return cur; } /* Allocate on demand the per device shaper's hierarchy container. @@ -444,12 +437,10 @@ static void net_shaper_commit(struct net_shaper_binding *binding, if (WARN_ON_ONCE(!cur)) continue; - /* Successful update: drop the tentative mark - * and update the hierarchy container. - */ + /* Successful update: update the hierarchy container... */ net_shaper_copy(cur, &shapers[i]); - smp_wmb(); - __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_VALID); + /* ... publish to lockless readers. */ + smp_store_release(&cur->valid, true); } xa_unlock(&hierarchy->shapers); } @@ -466,10 +457,10 @@ static void net_shaper_rollback(struct net_shaper_binding *binding) xa_lock(&hierarchy->shapers); xa_for_each(&hierarchy->shapers, index, cur) { - if (xa_get_mark(&hierarchy->shapers, index, NET_SHAPER_VALID)) + if (cur->valid) continue; __xa_erase(&hierarchy->shapers, index); - kfree(cur); + kfree_rcu(cur, rcu); } xa_unlock(&hierarchy->shapers); } @@ -882,12 +873,12 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, goto out_unlock; for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, - U32_MAX, NET_SHAPER_VALID)); + U32_MAX, XA_PRESENT)); ctx->start_index++) { - /* Pairs with smp_wmb() in net_shaper_commit(): the entry - * is marked VALID, so its contents must be visible too. - */ - smp_rmb(); + /* Check valid before reading fields */ + if (!smp_load_acquire(&shaper->valid)) + continue; + ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; -- cgit v1.2.3 From 2b50aceafe6606ea52ed42aadd1b4d44a188aade Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 16 May 2026 00:05:13 +0100 Subject: crypto/krb5, rxrpc: Fix lack of pre-decrypt/pre-verify length checks Change the krb5 crypto library to provide facilities to precheck the length of the message about to be decrypted or verified. Fix AF_RXRPC to make use of this to validate DATA packets secured with RxGK. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Closes: https://sashiko.dev/#/patchset/20260511160753.607296-1-dhowells%40redhat.com Signed-off-by: David Howells cc: Herbert Xu cc: Simon Horman cc: Chuck Lever cc: linux-afs@lists.infradead.org Reviewed-by: Jeffrey Altman Tested-by: Marc Dionne Link: https://patch.msgid.link/20260515230516.2718212-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/crypto/krb5.rst | 17 ++++++++++---- crypto/krb5/krb5_api.c | 54 ++++++++++++++++++++++++++++++++++++++----- include/crypto/krb5.h | 9 +++++--- include/trace/events/rxrpc.h | 1 + net/rxrpc/rxgk.c | 15 ++++++++++-- 5 files changed, 81 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/Documentation/crypto/krb5.rst b/Documentation/crypto/krb5.rst index beffa0133446..f62e07ac6811 100644 --- a/Documentation/crypto/krb5.rst +++ b/Documentation/crypto/krb5.rst @@ -158,13 +158,22 @@ returned. When a message has been received, the location and size of the data with the message can be determined by calling:: - void crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, - enum krb5_crypto_mode mode, - size_t *_offset, size_t *_len); + int crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t *_offset, size_t *_len); The caller provides the offset and length of the message to the function, which then alters those values to indicate the region containing the data (plus any -padding). It is up to the caller to determine how much padding there is. +padding). It is up to the caller to determine how much padding there is. The +function returns an error if the length is too small or if the mode is +unsupported. An additional function:: + + int crypto_krb5_check_data_len(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t len, size_t min_content); + +is provided to just do a basic check that the decrypted/verified message would +have a sufficient minimum payload. Preparation Functions --------------------- diff --git a/crypto/krb5/krb5_api.c b/crypto/krb5/krb5_api.c index 23026d4206c8..c7ea40f900a7 100644 --- a/crypto/krb5/krb5_api.c +++ b/crypto/krb5/krb5_api.c @@ -134,27 +134,69 @@ EXPORT_SYMBOL(crypto_krb5_how_much_data); * Find the offset and size of the data in a secure message so that this * information can be used in the metadata buffer which will get added to the * digest by crypto_krb5_verify_mic(). + * + * Return: 0 if successful, -EBADMSG if the message is too short or -EINVAL if + * the mode is unsupported. */ -void crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, - enum krb5_crypto_mode mode, - size_t *_offset, size_t *_len) +int crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t *_offset, size_t *_len) { switch (mode) { case KRB5_CHECKSUM_MODE: + if (*_len < krb5->cksum_len) + return -EBADMSG; *_offset += krb5->cksum_len; *_len -= krb5->cksum_len; - return; + return 0; case KRB5_ENCRYPT_MODE: + if (*_len < krb5->conf_len + krb5->cksum_len) + return -EBADMSG; *_offset += krb5->conf_len; *_len -= krb5->conf_len + krb5->cksum_len; - return; + return 0; default: WARN_ON_ONCE(1); - return; + return -EINVAL; } } EXPORT_SYMBOL(crypto_krb5_where_is_the_data); +/** + * crypto_krb5_check_data_len - Check a message is big enough + * @krb5: The encoding to use. + * @mode: Mode of operation. + * @len: The length of the secure blob. + * @min_content: Minimum length of the content inside the blob. + * + * Check that a message is large enough to hold whatever bits the encryption + * type wants to glue on (nonce, checksum) plus a minimum amount of content. + * + * Return: 0 if successful, -EBADMSG if the message is too short or -EINVAL if + * the mode is unsupported. + */ +int crypto_krb5_check_data_len(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t len, size_t min_content) +{ + switch (mode) { + case KRB5_CHECKSUM_MODE: + if (len < krb5->cksum_len || + len - krb5->cksum_len < min_content) + return -EBADMSG; + return 0; + case KRB5_ENCRYPT_MODE: + if (len < krb5->conf_len + krb5->cksum_len || + len - (krb5->conf_len + krb5->cksum_len) < min_content) + return -EBADMSG; + return 0; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } +} +EXPORT_SYMBOL(crypto_krb5_check_data_len); + /* * Prepare the encryption with derived key data. */ diff --git a/include/crypto/krb5.h b/include/crypto/krb5.h index 71dd38f59be1..aac3ecf88467 100644 --- a/include/crypto/krb5.h +++ b/include/crypto/krb5.h @@ -121,9 +121,12 @@ size_t crypto_krb5_how_much_buffer(const struct krb5_enctype *krb5, size_t crypto_krb5_how_much_data(const struct krb5_enctype *krb5, enum krb5_crypto_mode mode, size_t *_buffer_size, size_t *_offset); -void crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, - enum krb5_crypto_mode mode, - size_t *_offset, size_t *_len); +int crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t *_offset, size_t *_len); +int crypto_krb5_check_data_len(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t len, size_t min_content); struct crypto_aead *crypto_krb5_prepare_encryption(const struct krb5_enctype *krb5, const struct krb5_buffer *TK, u32 usage, gfp_t gfp); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 573f2df3a2c9..704a10de6670 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -71,6 +71,7 @@ EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \ EM(rxkad_abort_resp_version, "rxkad-resp-version") \ /* RxGK security errors */ \ + EM(rxgk_abort_1_short_header, "rxgk1-short-hdr") \ EM(rxgk_abort_1_verify_mic_eproto, "rxgk1-vfy-mic-eproto") \ EM(rxgk_abort_2_decrypt_eproto, "rxgk2-dec-eproto") \ EM(rxgk_abort_2_short_data, "rxgk2-short-data") \ diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 0d5e654da918..26e723052a37 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -480,8 +480,12 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, _enter(""); - crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, - &data_offset, &data_len); + if (crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len) < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_1_short_header); + goto put_gk; + } hdr = kzalloc_obj(*hdr, GFP_NOFS); if (!hdr) @@ -529,6 +533,13 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, _enter(""); + if (crypto_krb5_check_data_len(gk->krb5, KRB5_ENCRYPT_MODE, + len, sizeof(hdr)) < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); if (ret < 0) { if (ret != -ENOMEM) -- cgit v1.2.3 From 1bbf0ced1d9db73ac7893c2187f3459288603e0d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2026 08:46:11 +0000 Subject: tcp: fix stale per-CPU tcp_tw_isn leak enabling ISN prediction Blamed commit moved the TIME_WAIT-derived ISN from the skb control block to a per-CPU variable, assuming the value would always be consumed by tcp_conn_request() for the same packet that wrote it. That assumption is violated by multiple drop paths between the producer (__this_cpu_write(tcp_tw_isn, isn) in tcp_v{4,6}_rcv()) and the consumer (tcp_conn_request()): - min_ttl / min_hopcount check - xfrm policy check - tcp_inbound_hash() MD5/AO mismatch - tcp_filter() eBPF/SO_ATTACH_FILTER drop - th->syn && th->fin discard in tcp_rcv_state_process() TCP_LISTEN - psp_sk_rx_policy_check() in tcp_v{4,6}_do_rcv() - tcp_checksum_complete() in tcp_v{4,6}_do_rcv() - tcp_v{4,6}_cookie_check() returning NULL When a packet is dropped on any of these paths, tcp_tw_isn is left set. The next SYN processed on the same CPU then consumes the non zero value in tcp_conn_request(), receiving a potentially predictable ISN. This patch moves back tcp_tw_isn to skb->cb[], getting rid of the per-cpu variable. Note that tcp_v{4,6}_fill_cb() do not set it. Very litle impact on overall code size/complexity: $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/0 grow/shrink: 2/1 up/down: 8/-15 (-7) Function old new delta tcp_v6_rcv 3038 3042 +4 tcp_v4_rcv 3035 3039 +4 tcp_conn_request 2938 2923 -15 Total: Before=24436060, After=24436053, chg -0.00% Fixes: 41eecbd712b7 ("tcp: replace TCP_SKB_CB(skb)->tcp_tw_isn with a per-cpu field") Reported-by: Chris Mason Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260519084611.2485277-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 7 ++++--- net/ipv4/tcp.c | 3 --- net/ipv4/tcp_input.c | 15 ++++++--------- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- 5 files changed, 14 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index ecbadcb3a744..98848db62894 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -65,8 +65,6 @@ static inline void tcp_orphan_count_dec(void) this_cpu_dec(tcp_orphan_count); } -DECLARE_PER_CPU(u32, tcp_tw_isn); - void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) @@ -1102,10 +1100,13 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : + /* Notes : + * tcp_tw_isn is used in input path only + * (isn chosen by tcp_timewait_state_process()) * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ + u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 432fa28e47d4..389a7cc17110 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -299,9 +299,6 @@ enum { DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); -DEFINE_PER_CPU(u32, tcp_tw_isn); -EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); - long sysctl_tcp_mem[3] __read_mostly; DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d5c9e65d9760..de9f68a9c0cf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7589,6 +7589,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; + u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -7599,20 +7600,16 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct dst_entry *dst; struct flowi fl; u8 syncookies; - u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif - isn = __this_cpu_read(tcp_tw_isn); - if (isn) { - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - __this_cpu_write(tcp_tw_isn, 0); - } else { + /* If isn is non-zero, this SYN originally matched a TIME_WAIT socket. + * TW sockets are converted to open requests without limitations, + * we skip the queue limits and syncookie checks in the block below. + */ + if (!isn) { syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c0526cc03980..fdc81150ff6c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2198,6 +2198,7 @@ lookup: } } + isn = 0; process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ @@ -2227,6 +2228,7 @@ process: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); + TCP_SKB_CB(skb)->tcp_tw_isn = isn; skb->dev = NULL; @@ -2313,7 +2315,6 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; - __this_cpu_write(tcp_tw_isn, isn); goto process; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d13d49bfef19..36d75fb50a70 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1839,6 +1839,7 @@ lookup: } } + isn = 0; process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ @@ -1868,6 +1869,7 @@ process: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); + TCP_SKB_CB(skb)->tcp_tw_isn = isn; skb->dev = NULL; @@ -1956,7 +1958,6 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; - __this_cpu_write(tcp_tw_isn, isn); goto process; } -- cgit v1.2.3 From a494d3c8d5392bcdff83c2a593df0c160ff9f322 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 30 Apr 2026 12:28:16 +0900 Subject: ring-buffer: Flush and stop persistent ring buffer on panic On real hardware, panic and machine reboot may not flush hardware cache to memory. This means the persistent ring buffer, which relies on a coherent state of memory, may not have its events written to the buffer and they may be lost. Moreover, there may be inconsistency with the counters which are used for validation of the integrity of the persistent ring buffer which may cause all data to be discarded. To avoid this issue, stop recording of the ring buffer on panic and flush the cache of the ring buffer's memory. Fixes: e645535a954a ("tracing: Add option to use memmapped memory for trace boot instance") Cc: stable@vger.kernel.org Cc: Will Deacon Cc: Mathieu Desnoyers Cc: Ian Rogers Link: https://patch.msgid.link/177751969602.2136606.12031934362587643488.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Acked-by: Catalin Marinas Acked-by: Geert Uytterhoeven Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/Kbuild | 1 + arch/arc/include/asm/Kbuild | 1 + arch/arm/include/asm/Kbuild | 1 + arch/arm64/include/asm/ring_buffer.h | 10 ++++++++++ arch/csky/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/loongarch/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/mips/include/asm/Kbuild | 1 + arch/nios2/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + arch/parisc/include/asm/Kbuild | 1 + arch/powerpc/include/asm/Kbuild | 1 + arch/riscv/include/asm/Kbuild | 1 + arch/s390/include/asm/Kbuild | 1 + arch/sh/include/asm/Kbuild | 1 + arch/sparc/include/asm/Kbuild | 1 + arch/um/include/asm/Kbuild | 1 + arch/x86/include/asm/Kbuild | 1 + arch/xtensa/include/asm/Kbuild | 1 + include/asm-generic/ring_buffer.h | 13 +++++++++++++ kernel/trace/ring_buffer.c | 22 ++++++++++++++++++++++ 23 files changed, 65 insertions(+) create mode 100644 arch/arm64/include/asm/ring_buffer.h create mode 100644 include/asm-generic/ring_buffer.h (limited to 'include') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 483965c5a4de..b154b4e3dfa8 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -5,4 +5,5 @@ generic-y += agp.h generic-y += asm-offsets.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 4c69522e0328..483caacc6988 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -5,5 +5,6 @@ generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += user.h generic-y += text-patching.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 03657ff8fbe3..decad5f2c826 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += early_ioremap.h generic-y += extable.h generic-y += flat.h generic-y += parport.h +generic-y += ring_buffer.h generated-y += mach-types.h generated-y += unistd-nr.h diff --git a/arch/arm64/include/asm/ring_buffer.h b/arch/arm64/include/asm/ring_buffer.h new file mode 100644 index 000000000000..62316c406888 --- /dev/null +++ b/arch/arm64/include/asm/ring_buffer.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_ARM64_RING_BUFFER_H +#define _ASM_ARM64_RING_BUFFER_H + +#include + +/* Flush D-cache on persistent ring buffer */ +#define arch_ring_buffer_flush_range(start, end) dcache_clean_pop(start, end) + +#endif /* _ASM_ARM64_RING_BUFFER_H */ diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 3a5c7f6e5aac..7dca0c6cdc84 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += qrwlock.h generic-y += qrwlock_types.h generic-y += qspinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += user.h generic-y += vmlinux.lds.h generic-y += text-patching.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 1efa1e993d4b..0f887d4238ed 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -5,4 +5,5 @@ generic-y += extable.h generic-y += iomap.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 9034b583a88a..7e92957baf6a 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -10,5 +10,6 @@ generic-y += qrwlock.h generic-y += user.h generic-y += ioctl.h generic-y += mmzone.h +generic-y += ring_buffer.h generic-y += statfs.h generic-y += text-patching.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index b282e0dd8dc1..62543bf305ff 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -3,5 +3,6 @@ generated-y += syscall_table.h generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += spinlock.h generic-y += text-patching.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 7178f990e8b3..0030309b47ad 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += syscalls.h generic-y += tlb.h generic-y += user.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 684569b2ecd6..9771c3d85074 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -12,5 +12,6 @@ generic-y += mcs_spinlock.h generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h +generic-y += ring_buffer.h generic-y += user.h generic-y += text-patching.h diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 28004301c236..0a2530964413 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += cmpxchg.h generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += spinlock.h generic-y += user.h generic-y += text-patching.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index cef49d60d74c..8aa34621702d 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -8,4 +8,5 @@ generic-y += spinlock_types.h generic-y += spinlock.h generic-y += qrwlock_types.h generic-y += qrwlock.h +generic-y += ring_buffer.h generic-y += user.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 4fb596d94c89..d48d158f7241 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -4,4 +4,5 @@ generated-y += syscall_table_64.h generic-y += agp.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += user.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 2e23533b67e3..805b5aeebb6f 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -5,4 +5,5 @@ generated-y += syscall_table_spu.h generic-y += agp.h generic-y += mcs_spinlock.h generic-y += qrwlock.h +generic-y += ring_buffer.h generic-y += early_ioremap.h diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index bd5fc9403295..7721b63642f4 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -14,5 +14,6 @@ generic-y += ticket_spinlock.h generic-y += qrwlock.h generic-y += qrwlock_types.h generic-y += qspinlock.h +generic-y += ring_buffer.h generic-y += user.h generic-y += vmlinux.lds.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 80bad7de7a04..0c1fc47c3ba0 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -7,3 +7,4 @@ generated-y += unistd_nr.h generic-y += asm-offsets.h generic-y += mcs_spinlock.h generic-y += mmzone.h +generic-y += ring_buffer.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 4d3f10ed8275..f0403d3ee8ab 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -3,4 +3,5 @@ generated-y += syscall_table.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 17ee8a273aa6..49c6bb326b75 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -4,4 +4,5 @@ generated-y += syscall_table_64.h generic-y += agp.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += ring_buffer.h generic-y += text-patching.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 1b9b82bbe322..2a1629ba8140 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -17,6 +17,7 @@ generic-y += module.lds.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h +generic-y += ring_buffer.h generic-y += runtime-const.h generic-y += softirq_stack.h generic-y += switch_to.h diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4566000e15c4..078fd2c0d69d 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -14,3 +14,4 @@ generic-y += early_ioremap.h generic-y += fprobe.h generic-y += mcs_spinlock.h generic-y += mmzone.h +generic-y += ring_buffer.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 13fe45dea296..e57af619263a 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -6,5 +6,6 @@ generic-y += mcs_spinlock.h generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h +generic-y += ring_buffer.h generic-y += user.h generic-y += text-patching.h diff --git a/include/asm-generic/ring_buffer.h b/include/asm-generic/ring_buffer.h new file mode 100644 index 000000000000..201d2aee1005 --- /dev/null +++ b/include/asm-generic/ring_buffer.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generic arch dependent ring_buffer macros. + */ +#ifndef __ASM_GENERIC_RING_BUFFER_H__ +#define __ASM_GENERIC_RING_BUFFER_H__ + +#include + +/* Flush cache on ring buffer range if needed. Do nothing by default. */ +#define arch_ring_buffer_flush_range(start, end) do { } while (0) + +#endif /* __ASM_GENERIC_RING_BUFFER_H__ */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fcd93d49851e..7b07d2004cc6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include +#include #include #include #include @@ -559,6 +561,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + struct notifier_block flush_nb; struct ring_buffer_meta *meta; @@ -2521,6 +2524,16 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } +/* Stop recording on a persistent buffer and flush cache if needed. */ +static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data) +{ + struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb); + + ring_buffer_record_off(buffer); + arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end); + return NOTIFY_DONE; +} + static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, @@ -2651,6 +2664,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); + /* Persistent ring buffer needs to flush cache before reboot. */ + if (start && end) { + buffer->flush_nb.notifier_call = rb_flush_buffer_cb; + atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb); + } + return_ptr(buffer); fail_free_buffers: @@ -2749,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer) { int cpu; + if (buffer->range_addr_start && buffer->range_addr_end) + atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb); + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); -- cgit v1.2.3 From 215c90ee656114f5e8c32408228d97082f8e0eef Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 6 May 2026 13:57:00 +0200 Subject: device property: set fwnode->secondary to NULL in fwnode_init() If a firmware node is allocated on the stack (for instance: temporary software node whose life-time we control) or on the heap - but using a non-zeroing allocation function - and initialized using fwnode_init(), its secondary pointer will contain uninitalized memory which likely will be neither NULL nor IS_ERR() and so may end up being dereferenced (for example: in dev_to_swnode()). Set fwnode->secondary to NULL on initialization. Cc: stable Fixes: 01bb86b380a3 ("driver core: Add fwnode_init()") Signed-off-by: Bartosz Golaszewski Reviewed-by: Rafael J. Wysocki (Intel) Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Link: https://patch.msgid.link/20260506115701.23035-1-bartosz.golaszewski@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 80b38fbf2121..31df7608737e 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -208,6 +208,7 @@ struct fwnode_operations { static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) { + fwnode->secondary = NULL; fwnode->ops = ops; INIT_LIST_HEAD(&fwnode->consumers); INIT_LIST_HEAD(&fwnode->suppliers); -- cgit v1.2.3