summary refs log tree commit diff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 00:30:58 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 00:30:58 +0300
commit 9c9e6bd4cca02f2d183eb260451fb6018f9ee67e (patch)
tree 5c351e1db3b13ffce9ce02c95173e58d35975d1f
parent 5d15ab717d503ff10b585a144870648b9a88c616 (diff)
parent 38205ecbe6b6dc47968ad4e9c978e2117720969e (diff)
download linux-9c9e6bd4cca02f2d183eb260451fb6018f9ee67e.tar.xz
Merge tag 'kernel-7.2-rc1.task_exec_state' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull task_exec_state updates from Christian Brauner:

"This introduces a new per-task task_exec_state structure and relocates the dumpable mode and the user namespace captured at execve() from mm_struct onto it. It stays attached to the task for its full lifetime.

__ptrace_may_access() and several /proc owner and visibility checks need to consult two pieces of state for any observable task, including zombies that have already gone through exit_mm(): the dumpable mode and the user namespace captured at execve(). Both live on mm_struct today, which exit_mm() clears from the task long before the task is reaped. A reader that races with do_exit() observes task->mm == NULL and either fails the check or falls back to init_user_ns - which denies legitimate access to non-dumpable zombies that were running in a nested user namespace.

mm_struct loses ->user_ns and the dumpability bits in ->flags. MMF_DUMPABLE_BITS is reserved so the MMF_DUMP_FILTER_* layout exposed via /proc/<pid>/coredump_filter stays stable. task->user_dumpable and its exit_mm() snapshot are removed.

task_exec_state is the privilege domain established by an execve(). Within a thread group it is shared via refcount; across thread groups each task has its own:

 - CLONE_VM siblings (thread-group members, io_uring workers) refcount-share the parent's exec_state.

 - Non-CLONE_VM clones (fork(), vfork() without CLONE_VM) allocate a fresh exec_state inheriting the parent's dumpable mode and user_ns.

 - execve() in the child allocates a fresh instance and installs it under task_lock + exec_update_lock via task_exec_state_replace().

 - Credential changes (setresuid, capset, ...) and prctl(PR_SET_DUMPABLE) update dumpability on the current task's exec_state, i.e., on the thread group's shared instance.

On top of this exec_mmap() no longer tears down the old mm while holding exec_update_lock for writing and cred_guard_mutex. Neither lock is needed for that: exec_update_lock only exists to make the mm swap atomic with the later commit_creds() and all its readers operate on the new mm; none looks at the detached old mm.

The cost was real: __mmput() runs exit_mmap() over the entire old address space and can block in exit_aio() waiting for in-flight AIO, so execve() of a large process blocked ptrace_attach() and every exec_update_lock reader for the duration of the teardown.

The old mm is now stashed in bprm->old_mm and released from setup_new_exec() after both locks are dropped, with a backstop in free_bprm() for the error paths"

* tag 'kernel-7.2-rc1.task_exec_state' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  exec: free the old mm outside the exec locks
  exec_state: relocate dumpable information
  ptrace: add ptracer_access_allowed()
  exec: introduce struct task_exec_state
  sched/coredump: introduce enum task_dumpable
-rw-r--r-- arch/arm64/kernel/mte.c 6
-rw-r--r-- drivers/firmware/efi/efi.c 1
-rw-r--r-- fs/coredump.c 22
-rw-r--r-- fs/exec.c 65
-rw-r--r-- fs/pidfs.c 23
-rw-r--r-- fs/proc/base.c 39
-rw-r--r-- include/linux/binfmts.h 3
-rw-r--r-- include/linux/coredump.h 4
-rw-r--r-- include/linux/mm_types.h 9
-rw-r--r-- include/linux/ptrace.h 1
-rw-r--r-- include/linux/sched.h 6
-rw-r--r-- include/linux/sched/coredump.h 47
-rw-r--r-- include/linux/sched/exec_state.h 31
-rw-r--r-- init/init_task.c 10
-rw-r--r-- kernel/Makefile 2
-rw-r--r-- kernel/cred.c 3
-rw-r--r-- kernel/exec_state.c 119
-rw-r--r-- kernel/exit.c 1
-rw-r--r-- kernel/fork.c 33
-rw-r--r-- kernel/kthread.c 1
-rw-r--r-- kernel/ptrace.c 51
-rw-r--r-- kernel/sys.c 6
-rw-r--r-- mm/init-mm.c 1
23 files changed, 329 insertions, 155 deletions
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 6874b16d0657..1a9aad6ef22a 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/prctl.h>
+#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/string.h>
@@ -537,16 +538,13 @@ static int access_remote_tags(struct task_struct *tsk, unsigned long addr,
if (!mm)
return -EPERM;
- if (!tsk->ptrace || (current != tsk->parent) ||
- ((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptracer_capable(tsk, mm->user_ns))) {
+ if (!ptracer_access_allowed(tsk)) {
mmput(mm);
return -EPERM;
}
ret = __access_remote_tags(mm, addr, kiov, gup_flags);
mmput(mm);
-
return ret;
}
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 318d1cc9a066..0327a39d31fa 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -73,7 +73,6 @@ struct mm_struct efi_mm = {
MMAP_LOCK_INITIALIZER(efi_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(efi_mm.mmlist),
- .user_ns = &init_user_ns,
#ifdef CONFIG_SCHED_MM_CID
.mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(efi_mm.mm_cid.lock),
#endif
diff --git a/fs/coredump.c b/fs/coredump.c
index bb6fdb1f458e..e943569e9b6d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -395,8 +395,7 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm,
cred->gid));
break;
case 'd':
- err = cn_printf(cn, "%d",
- __get_dumpable(cprm->mm_flags));
+ err = cn_printf(cn, "%d", cprm->dumpable);
break;
/* signal that caused the coredump */
case 's':
@@ -869,11 +868,11 @@ static inline void coredump_sock_shutdown(struct file *file) { }
static inline bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { return false; }
#endif
-/* cprm->mm_flags contains a stable snapshot of dumpability flags. */
+/* cprm->dumpable is the snapshot of task dumpability at dump start. */
static inline bool coredump_force_suid_safe(const struct coredump_params *cprm)
{
/* Require nonrelative corefile path and be extra careful. */
- return __get_dumpable(cprm->mm_flags) == SUID_DUMP_ROOT;
+ return cprm->dumpable == TASK_DUMPABLE_ROOT;
}
static bool coredump_file(struct core_name *cn, struct coredump_params *cprm,
@@ -1085,7 +1084,7 @@ static inline bool coredump_skip(const struct coredump_params *cprm,
return true;
if (!binfmt->core_dump)
return true;
- if (!__get_dumpable(cprm->mm_flags))
+ if (cprm->dumpable == TASK_DUMPABLE_OFF)
return true;
return false;
}
@@ -1170,14 +1169,9 @@ void vfs_coredump(const kernel_siginfo_t *siginfo)
struct coredump_params cprm = {
.siginfo = siginfo,
.limit = rlimit(RLIMIT_CORE),
- /*
- * We must use the same mm->flags while dumping core to avoid
- * inconsistency of bit flags, since this flag is not protected
- * by any locks.
- *
- * Note that we only care about MMF_DUMP* flags.
- */
- .mm_flags = __mm_flags_get_dumpable(mm),
+ /* Snapshot MMF_DUMP_FILTER_* (unlocked) and dumpable for the dump. */
+ .mm_flags = __mm_flags_get_word(mm),
+ .dumpable = task_exec_state_get_dumpable(current),
.vma_meta = NULL,
.cpu = raw_smp_processor_id(),
};
@@ -1419,7 +1413,7 @@ EXPORT_SYMBOL(dump_align);
void validate_coredump_safety(void)
{
- if (suid_dumpable == SUID_DUMP_ROOT &&
+ if (suid_dumpable == TASK_DUMPABLE_ROOT &&
core_pattern[0] != '/' && core_pattern[0] != '|' && core_pattern[0] != '@') {
coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: "
diff --git a/fs/exec.c b/fs/exec.c
index ba12b4c466f6..824b46c069ae 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -35,6 +35,7 @@
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
+#include <linux/sched/exec_state.h>
#include <linux/sched/signal.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
@@ -263,6 +264,9 @@ static int bprm_mm_init(struct linux_binprm *bprm)
if (!mm)
goto err;
+ /* Staged for would_dump() narrowing; consumed by begin_new_exec(). */
+ bprm->user_ns = get_user_ns(current_user_ns());
+
/* Save current stack limit for all calculations made during exec. */
task_lock(current->group_leader);
bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
@@ -832,14 +836,21 @@ EXPORT_SYMBOL(read_code);
/*
* Maps the mm_struct mm into the current task struct.
* On success, this function returns with exec_update_lock
- * held for writing.
+ * held for writing. The replaced address space is stashed in
+ * bprm->old_mm for setup_new_exec() to release outside the lock.
*/
-static int exec_mmap(struct mm_struct *mm)
+static int exec_mmap(struct linux_binprm *bprm)
{
+ struct task_exec_state *exec_state __free(put_task_exec_state) = NULL;
+ struct mm_struct *mm = bprm->mm;
struct task_struct *tsk;
struct mm_struct *old_mm, *active_mm;
int ret;
+ exec_state = alloc_task_exec_state(bprm->user_ns);
+ if (!exec_state)
+ return -ENOMEM;
+
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
@@ -870,6 +881,7 @@ static int exec_mmap(struct mm_struct *mm)
tsk->active_mm = mm;
tsk->mm = mm;
mm_init_cid(mm, tsk);
+ exec_state = task_exec_state_replace(tsk, exec_state);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
@@ -888,15 +900,22 @@ static int exec_mmap(struct mm_struct *mm)
if (old_mm) {
mmap_read_unlock(old_mm);
BUG_ON(active_mm != old_mm);
- setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
- mm_update_next_owner(old_mm);
- mmput(old_mm);
+ /* Defer teardown to setup_new_exec(), outside the exec locks. */
+ bprm->old_mm = old_mm;
return 0;
}
mmdrop_lazy_tlb(active_mm);
return 0;
}
+/* Release the address space replaced by exec, outside the exec locks. */
+static void exec_mm_put_old(struct mm_struct *old_mm)
+{
+ setmax_mm_hiwater_rss(&current->signal->maxrss, old_mm);
+ mm_update_next_owner(old_mm);
+ mmput(old_mm); /* can block: __mmput() tears down the whole old address space */
+}
+
static int de_thread(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
@@ -1145,7 +1164,7 @@ int begin_new_exec(struct linux_binprm * bprm)
* Release all of the old mmap stuff
*/
acct_arg_size(bprm, 0);
- retval = exec_mmap(bprm->mm);
+ retval = exec_mmap(bprm);
if (retval)
goto out;
@@ -1210,9 +1229,9 @@ int begin_new_exec(struct linux_binprm * bprm)
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
!(uid_eq(current_euid(), current_uid()) &&
gid_eq(current_egid(), current_gid())))
- set_dumpable(current->mm, suid_dumpable);
+ task_exec_state_set_dumpable(suid_dumpable);
else
- set_dumpable(current->mm, SUID_DUMP_USER);
+ task_exec_state_set_dumpable(TASK_DUMPABLE_OWNER);
perf_event_exec();
@@ -1261,7 +1280,7 @@ int begin_new_exec(struct linux_binprm * bprm)
* wait until new credentials are committed
* by commit_creds() above
*/
- if (get_dumpable(me->mm) != SUID_DUMP_USER)
+ if (task_exec_state_get_dumpable(me) != TASK_DUMPABLE_OWNER)
perf_event_exit_task(me);
/*
* cred_guard_mutex must be held at least to this point to prevent
@@ -1298,14 +1317,14 @@ void would_dump(struct linux_binprm *bprm, struct file *file)
struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
- /* Ensure mm->user_ns contains the executable */
- user_ns = old = bprm->mm->user_ns;
+ /* Ensure bprm->user_ns contains the executable. */
+ user_ns = old = bprm->user_ns;
while ((user_ns != &init_user_ns) &&
!privileged_wrt_inode_uidgid(user_ns, idmap, inode))
user_ns = user_ns->parent;
if (old != user_ns) {
- bprm->mm->user_ns = get_user_ns(user_ns);
+ bprm->user_ns = get_user_ns(user_ns);
put_user_ns(old);
}
}
@@ -1328,6 +1347,12 @@ void setup_new_exec(struct linux_binprm * bprm)
me->mm->task_size = TASK_SIZE;
up_write(&me->signal->exec_update_lock);
mutex_unlock(&me->signal->cred_guard_mutex);
+
+ /* The exec locks are dropped: release the old address space now. */
+ if (bprm->old_mm) {
+ exec_mm_put_old(bprm->old_mm);
+ bprm->old_mm = NULL;
+ }
}
EXPORT_SYMBOL(setup_new_exec);
@@ -1375,6 +1400,8 @@ static void free_bprm(struct linux_binprm *bprm)
acct_arg_size(bprm, 0);
mmput(bprm->mm);
}
+ if (bprm->user_ns)
+ put_user_ns(bprm->user_ns);
free_arg_pages(bprm);
if (bprm->cred) {
/* in case exec fails before de_thread() succeeds */
@@ -1382,6 +1409,9 @@ static void free_bprm(struct linux_binprm *bprm)
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
+ /* exec swapped the mm but failed before setup_new_exec() freed it */
+ if (bprm->old_mm)
+ exec_mm_put_old(bprm->old_mm);
do_close_execat(bprm->file);
if (bprm->executable)
fput(bprm->executable);
@@ -1905,17 +1935,6 @@ void set_binfmt(struct linux_binfmt *new)
}
EXPORT_SYMBOL(set_binfmt);
-/*
- * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
- */
-void set_dumpable(struct mm_struct *mm, int value)
-{
- if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
- return;
-
- __mm_flags_set_mask_dumpable(mm, value);
-}
-
static inline struct user_arg_ptr native_arg(const char __user *const __user *p)
{
return (struct user_arg_ptr){.ptr.native = p};
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 1cce4f34a051..b2ff950a096e 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -338,14 +338,14 @@ static inline bool pid_in_current_pidns(const struct pid *pid)
return false;
}
-static __u32 pidfs_coredump_mask(unsigned long mm_flags)
+static __u32 pidfs_coredump_mask(enum task_dumpable dumpable)
{
- switch (__get_dumpable(mm_flags)) {
- case SUID_DUMP_USER:
+ switch (dumpable) {
+ case TASK_DUMPABLE_OWNER:
return PIDFD_COREDUMP_USER;
- case SUID_DUMP_ROOT:
+ case TASK_DUMPABLE_ROOT:
return PIDFD_COREDUMP_ROOT;
- case SUID_DUMP_DISABLE:
+ case TASK_DUMPABLE_OFF:
return PIDFD_COREDUMP_SKIP;
default:
WARN_ON_ONCE(true);
@@ -433,14 +433,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
return -ESRCH;
if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) {
- guard(task_lock)(task);
- if (task->mm) {
- unsigned long flags = __mm_flags_get_dumpable(task->mm);
-
- kinfo.coredump_mask = pidfs_coredump_mask(flags);
- kinfo.mask |= PIDFD_INFO_COREDUMP;
- /* No coredump actually took place, so no coredump signal. */
- }
+ kinfo.coredump_mask = pidfs_coredump_mask(task_exec_state_get_dumpable(task));
+ kinfo.mask |= PIDFD_INFO_COREDUMP;
+ /* No coredump actually took place, so no coredump signal. */
}
/* Unconditionally return identifiers and credentials, the rest only on request */
@@ -779,7 +774,7 @@ void pidfs_coredump(const struct coredump_params *cprm)
VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
/* Note how we were coredumped and that we coredumped. */
- attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) |
+ attr->coredump_mask = pidfs_coredump_mask(cprm->dumpable) |
PIDFD_COREDUMPED;
/* If coredumping is set to skip we should never end up here. */
VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d9acfa89c894..65f56136ec3f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -91,6 +91,7 @@
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
+#include <linux/sched/exec_state.h>
#include <linux/sched/stat.h>
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
@@ -1893,7 +1894,6 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
cred = __task_cred(task);
uid = cred->euid;
gid = cred->egid;
- rcu_read_unlock();
/*
* Before the /proc/pid/status file was created the only way to read
@@ -1903,29 +1903,22 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
* made this apply to all per process world readable and executable
* directories.
*/
- if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
- struct mm_struct *mm;
- task_lock(task);
- mm = task->mm;
- /* Make non-dumpable tasks owned by some root */
- if (mm) {
- if (get_dumpable(mm) != SUID_DUMP_USER) {
- struct user_namespace *user_ns = mm->user_ns;
-
- uid = make_kuid(user_ns, 0);
- if (!uid_valid(uid))
- uid = GLOBAL_ROOT_UID;
-
- gid = make_kgid(user_ns, 0);
- if (!gid_valid(gid))
- gid = GLOBAL_ROOT_GID;
- }
- } else {
- uid = GLOBAL_ROOT_UID;
- gid = GLOBAL_ROOT_GID;
+ if (mode != (S_IFDIR | S_IRUGO | S_IXUGO)) {
+ struct task_exec_state *exec_state;
+
+ exec_state = task_exec_state_rcu(task);
+ if (READ_ONCE(exec_state->dumpable) != TASK_DUMPABLE_OWNER) {
+ uid = make_kuid(exec_state->user_ns, 0);
+ if (!uid_valid(uid))
+ uid = GLOBAL_ROOT_UID;
+
+ gid = make_kgid(exec_state->user_ns, 0);
+ if (!gid_valid(gid))
+ gid = GLOBAL_ROOT_GID;
}
- task_unlock(task);
}
+ rcu_read_unlock();
+
*ruid = uid;
*rgid = gid;
}
@@ -2965,7 +2958,7 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
ret = 0;
mm = get_task_mm(task);
if (mm) {
- unsigned long flags = __mm_flags_get_dumpable(mm);
+ unsigned long flags = __mm_flags_get_word(mm);
len = snprintf(buffer, sizeof(buffer), "%08lx\n",
((flags & MMF_DUMP_FILTER_MASK) >>
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 65abd5ab8836..2c77e383e737 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -25,6 +25,9 @@ struct linux_binprm {
struct page *page[MAX_ARG_PAGES];
#endif
struct mm_struct *mm;
+ struct mm_struct *old_mm; /* replaced address space, freed by setup_new_exec() */
+ /* user_ns published to task->exec_state at execve, narrowed by would_dump(). */
+ struct user_namespace *user_ns;
unsigned long p; /* current top of mem */
unsigned int
/* Should an execfd be passed to userspace? */
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 68861da4cf7c..7b38ee2e7913 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/sched/coredump.h>
#include <asm/siginfo.h>
#ifdef CONFIG_COREDUMP
@@ -20,7 +21,10 @@ struct coredump_params {
const kernel_siginfo_t *siginfo;
struct file *file;
unsigned long limit;
+ /* MMF_DUMP_FILTER_* bits, snapshot of mm->flags at dump start. */
unsigned long mm_flags;
+ /* Snapshot of dumpable at dump start. */
+ enum task_dumpable dumpable;
int cpu;
loff_t written;
loff_t pos;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a308e2c23b82..9588ce3b16df 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1342,7 +1342,6 @@ struct mm_struct {
*/
struct task_struct __rcu *owner;
#endif
- struct user_namespace *user_ns;
/* store ref to file /proc/<pid>/exe symlink points to */
struct file __rcu *exe_file;
@@ -1907,11 +1906,11 @@ enum {
/* mm flags */
/*
- * The first two bits represent core dump modes for set-user-ID,
- * the modes are SUID_DUMP_* defined in linux/sched/coredump.h
+ * Bits 0 and 1 were dumpability; that moved to task->exec_state. Reserve
+ * the bits so MMF_DUMP_FILTER_* positions stay stable for the
+ * /proc/<pid>/coredump_filter ABI.
*/
#define MMF_DUMPABLE_BITS 2
-#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1)
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE 2
#define MMF_DUMP_ANON_SHARED 3
@@ -1972,7 +1971,7 @@ enum {
#define MMF_TOPDOWN 31 /* mm searches top down by default */
#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN)
-#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+#define MMF_INIT_LEGACY_MASK (MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 90507d4afcd6..ef314f7a9ecc 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -17,6 +17,7 @@ struct syscall_info {
struct seccomp_data data;
};
+bool ptracer_access_allowed(struct task_struct *tsk);
extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ee06cba5c6f5..258cb075478d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -85,6 +85,7 @@ struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
+struct task_exec_state;
struct task_group;
struct task_struct;
struct timespec64;
@@ -962,6 +963,8 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
+ struct task_exec_state __rcu *exec_state;
+
int exit_state;
int exit_code;
int exit_signal;
@@ -1002,9 +1005,6 @@ struct task_struct {
unsigned sched_rt_mutex:1;
#endif
- /* Save user-dumpable when mm goes away */
- unsigned user_dumpable:1;
-
/* Bit to tell TOMOYO we're in execve(): */
unsigned in_execve:1;
unsigned in_iowait:1;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 624fda17a785..20957ccde3b5 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -2,43 +2,18 @@
#ifndef _LINUX_SCHED_COREDUMP_H
#define _LINUX_SCHED_COREDUMP_H
-#include <linux/mm_types.h>
-
-#define SUID_DUMP_DISABLE 0 /* No setuid dumping */
-#define SUID_DUMP_USER 1 /* Dump as user of process */
-#define SUID_DUMP_ROOT 2 /* Dump as root */
-
-static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm)
-{
- /*
- * By convention, dumpable bits are contained in first 32 bits of the
- * bitmap, so we can simply access this first unsigned long directly.
- */
- return __mm_flags_get_word(mm);
-}
-
-static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value)
-{
- __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value);
-}
-
-extern void set_dumpable(struct mm_struct *mm, int value);
/*
- * This returns the actual value of the suid_dumpable flag. For things
- * that are using this for checking for privilege transitions, it must
- * test against SUID_DUMP_USER rather than treating it as a boolean
- * value.
+ * Task dumpability mode. Gates core dump production and ptrace_attach()
+ * authorization. The numeric values are stable ABI (suid_dumpable
+ * sysctl, prctl(PR_SET_DUMPABLE)); do not renumber.
*/
-static inline int __get_dumpable(unsigned long mm_flags)
-{
- return mm_flags & MMF_DUMPABLE_MASK;
-}
-
-static inline int get_dumpable(struct mm_struct *mm)
-{
- unsigned long flags = __mm_flags_get_dumpable(mm);
-
- return __get_dumpable(flags);
-}
+enum task_dumpable {
+ TASK_DUMPABLE_OFF = 0, /* no dump; ptrace needs CAP_SYS_PTRACE */
+ TASK_DUMPABLE_OWNER = 1, /* default; dump and ptrace by uid match */
+ TASK_DUMPABLE_ROOT = 2, /* dump as root; ptrace needs CAP_SYS_PTRACE */
+};
+
+void task_exec_state_set_dumpable(enum task_dumpable value);
+enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task);
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/linux/sched/exec_state.h b/include/linux/sched/exec_state.h
new file mode 100644
index 000000000000..9b61782510b8
--- /dev/null
+++ b/include/linux/sched/exec_state.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+#ifndef _LINUX_SCHED_EXEC_STATE_H
+#define _LINUX_SCHED_EXEC_STATE_H
+
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
+#include <linux/sched/coredump.h>
+#include <linux/user_namespace.h>
+
+struct task_exec_state {
+ refcount_t count; /* one reference per task; shared by CLONE_VM siblings */
+ enum task_dumpable dumpable; /* accessed with READ_ONCE()/WRITE_ONCE() */
+ struct user_namespace *user_ns; /* reference held until the RCU-deferred free */
+ struct rcu_head rcu; /* used by call_rcu() in put_task_exec_state() */
+};
+
+extern struct task_exec_state init_task_exec_state;
+
+struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns);
+void put_task_exec_state(struct task_exec_state *exec_state);
+struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk);
+struct task_exec_state *task_exec_state_replace(struct task_struct *tsk,
+ struct task_exec_state *exec_state);
+int task_exec_state_copy(struct task_struct *tsk);
+void __init exec_state_init(void);
+
+DEFINE_FREE(put_task_exec_state, struct task_exec_state *, put_task_exec_state(_T))
+
+#endif /* _LINUX_SCHED_EXEC_STATE_H */
diff --git a/init/init_task.c b/init/init_task.c
index b5f48ebdc2b6..8cad78da469c 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -7,6 +7,8 @@
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include <linux/sched/ext.h>
+#include <linux/sched/exec_state.h>
+#include <linux/user_namespace.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -56,6 +58,13 @@ static struct sighand_struct init_sighand = {
.signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
};
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct task_exec_state init_task_exec_state = {
+ .count = REFCOUNT_INIT(2),
+ .dumpable = TASK_DUMPABLE_OWNER,
+ .user_ns = &init_user_ns,
+};
+
#ifdef CONFIG_SHADOW_CALL_STACK
unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
@@ -113,6 +122,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
+ .exec_state = &init_task_exec_state,
.restart_block = {
.fn = do_no_restart_syscall,
},
diff --git a/kernel/Makefile b/kernel/Makefile
index 6785982013dc..1e1a31673577 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
# Makefile for the linux kernel.
#
-obj-y = fork.o exec_domain.o panic.o \
+obj-y = fork.o exec_domain.o exec_state.o panic.o \
cpu.o exit.o softirq.o resource.o \
sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
diff --git a/kernel/cred.c b/kernel/cred.c
index 12a7b1ce5131..3df4e15bd67f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -384,8 +384,9 @@ int commit_creds(struct cred *new)
!uid_eq(old->fsuid, new->fsuid) ||
!gid_eq(old->fsgid, new->fsgid) ||
!cred_cap_issubset(old, new)) {
+ /* mm-less tasks share init_task's exec_state */
if (task->mm)
- set_dumpable(task->mm, suid_dumpable);
+ task_exec_state_set_dumpable(suid_dumpable);
task->pdeath_signal = 0;
/*
* If a task drops privileges and becomes nondumpable,
diff --git a/kernel/exec_state.c b/kernel/exec_state.c
new file mode 100644
index 000000000000..6034f4b4808f
--- /dev/null
+++ b/kernel/exec_state.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/refcount.h>
+#include <linux/sched.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/exec_state.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+
+static struct kmem_cache *task_exec_state_cachep;
+
+static void __free_task_exec_state(struct rcu_head *rcu) /* call_rcu() callback */
+{
+ struct task_exec_state *exec_state = container_of(rcu, struct task_exec_state, rcu);
+
+ put_user_ns(exec_state->user_ns); /* reference taken in alloc_task_exec_state() */
+ kmem_cache_free(task_exec_state_cachep, exec_state);
+}
+
+void put_task_exec_state(struct task_exec_state *exec_state) /* NULL is a no-op */
+{
+ if (exec_state && refcount_dec_and_test(&exec_state->count))
+ call_rcu(&exec_state->rcu, __free_task_exec_state); /* last ref: free after a grace period */
+}
+
+struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns) /* NULL on OOM */
+{
+ struct task_exec_state *exec_state;
+
+ exec_state = kmem_cache_alloc(task_exec_state_cachep, GFP_KERNEL);
+ if (!exec_state)
+ return NULL;
+ refcount_set(&exec_state->count, 1); /* the caller owns the initial reference */
+ exec_state->dumpable = TASK_DUMPABLE_OFF; /* fork and exec callers set the real mode */
+ exec_state->user_ns = get_user_ns(user_ns); /* own reference; the caller keeps its own */
+ return exec_state;
+}
+
+struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk)
+{
+ struct task_exec_state *exec_state;
+
+ exec_state = rcu_dereference_check(tsk->exec_state,
+ lockdep_is_held(&tsk->alloc_lock)); /* RCU read side or tsk->alloc_lock held */
+ WARN_ON_ONCE(!exec_state); /* warns, but the NULL pointer is still returned */
+ return exec_state;
+}
+
+struct task_exec_state *task_exec_state_replace(struct task_struct *tsk,
+ struct task_exec_state *exec_state)
+{
+ /*
+ * Updates must hold both locks so callers needing a consistent
+ * snapshot of mm + dumpability are covered. Returns the old pointer.
+ */
+ lockdep_assert_held(&tsk->alloc_lock);
+ lockdep_assert_held_write(&tsk->signal->exec_update_lock);
+
+ return rcu_replace_pointer(tsk->exec_state, exec_state, true); /* caller drops the old ref */
+}
+
+/*
+ * The non-CLONE_VM clone path: allocate a fresh exec_state for @tsk that
+ * inherits current's dumpable mode and takes its own user_ns reference.
+ * CLONE_VM siblings refcount-share via copy_exec_state() in fork.c; only
+ * this path and execve() ever allocate. Returns 0 or -ENOMEM.
+ */
+int task_exec_state_copy(struct task_struct *tsk)
+{
+ struct task_exec_state *src, *dst;
+
+ src = rcu_dereference_protected(current->exec_state, true);
+ dst = alloc_task_exec_state(src->user_ns);
+ if (!dst)
+ return -ENOMEM; /* tsk->exec_state has not been touched yet */
+ dst->dumpable = READ_ONCE(src->dumpable);
+ rcu_assign_pointer(tsk->exec_state, dst);
+ return 0;
+}
+
+/*
+ * Store TASK_DUMPABLE_* on current->exec_state; out-of-range values warn
+ * and are stored as TASK_DUMPABLE_OFF. All callers (commit_creds,
+ * begin_new_exec, prctl(PR_SET_DUMPABLE)) act on the running task, which
+ * guarantees ->exec_state is allocated and cannot be replaced under us.
+ */
+void task_exec_state_set_dumpable(enum task_dumpable value)
+{
+ struct task_exec_state *exec_state;
+
+ if (WARN_ON_ONCE(value > TASK_DUMPABLE_ROOT))
+ value = TASK_DUMPABLE_OFF;
+
+ exec_state = rcu_dereference_protected(current->exec_state, true);
+ /* mm-less tasks share init_task's exec_state; never mutate it */
+ if (WARN_ON_ONCE(exec_state == &init_task_exec_state))
+ return;
+ WRITE_ONCE(exec_state->dumpable, value); /* lockless readers use READ_ONCE() */
+}
+
+enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task)
+{
+ struct task_exec_state *exec_state;
+
+ guard(rcu)(); /* exec_state is freed via call_rcu(); hold RCU across the read */
+ exec_state = rcu_dereference(task->exec_state);
+ return READ_ONCE(exec_state->dumpable); /* pairs with WRITE_ONCE() in the setter */
+}
+
+void __init exec_state_init(void) /* called from proc_caches_init() */
+{
+ task_exec_state_cachep = kmem_cache_create("task_exec_state",
+ sizeof(struct task_exec_state), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
+}
diff --git a/kernel/exit.c b/kernel/exit.c
index f50d73c272d6..9a909993ab1d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -571,7 +571,6 @@ static void exit_mm(void)
*/
smp_mb__after_spinlock();
local_irq_disable();
- current->user_dumpable = (get_dumpable(mm) == SUID_DUMP_USER);
current->mm = NULL;
membarrier_update_current_mm(NULL);
enter_lazy_tlb(mm, current);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8ac38beae360..ba6b03d4a85c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/sched/ext.h>
+#include <linux/sched/exec_state.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
@@ -555,6 +556,7 @@ void free_task(struct task_struct *tsk)
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
bpf_task_storage_free(tsk);
+ put_task_exec_state(rcu_access_pointer(tsk->exec_state));
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -731,7 +733,6 @@ void __mmdrop(struct mm_struct *mm)
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
- put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
mm_destroy_cid(mm);
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
@@ -946,6 +947,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->seccomp.filter = NULL;
#endif
+ RCU_INIT_POINTER(tsk->exec_state, NULL);
+
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
@@ -1072,8 +1075,7 @@ static void mmap_init_lock(struct mm_struct *mm)
#endif
}
-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
- struct user_namespace *user_ns)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
@@ -1132,7 +1134,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
NR_MM_COUNTERS))
goto fail_pcpu;
- mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
@@ -1163,7 +1164,7 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- return mm_init(mm, current, current_user_ns());
+ return mm_init(mm, current);
}
EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
@@ -1527,7 +1528,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
memcpy(mm, oldmm, sizeof(*mm));
- if (!mm_init(mm, tsk, mm->user_ns))
+ if (!mm_init(mm, tsk))
goto fail_nomem;
uprobe_start_dup_mmap();
@@ -1593,6 +1594,22 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
return 0;
}
+static int copy_exec_state(u64 clone_flags, struct task_struct *tsk)
+{
+ struct task_exec_state *exec_state;
+
+ /* CLONE_VM siblings refcount-share the parent's exec_state. */
+ if (clone_flags & CLONE_VM) {
+ exec_state = rcu_dereference_protected(current->exec_state, true);
+ refcount_inc(&exec_state->count);
+ rcu_assign_pointer(tsk->exec_state, exec_state);
+ return 0;
+ }
+
+ /* Everyone else inherits a fresh copy. */
+ return task_exec_state_copy(tsk);
+}
+
static int copy_fs(u64 clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
@@ -2090,6 +2107,9 @@ __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
+ retval = copy_exec_state(clone_flags, p);
+ if (retval)
+ goto bad_fork_free;
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
@@ -3097,6 +3117,7 @@ void __init proc_caches_init(void)
sizeof(struct signal_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
+ exec_state_init();
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 791210daf8b4..63beb59b7a3d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1619,7 +1619,6 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
- WARN_ON_ONCE(!mm->user_ns);
/*
* It is possible for mm to be the same as tsk->active_mm, but
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 130043bfc209..d041645d9d17 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
+#include <linux/sched/exec_state.h>
#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/mm.h>
@@ -36,6 +37,30 @@
#include <asm/syscall.h> /* for syscall_get_* */
+/**
+ * ptracer_access_allowed - may current peek/poke @tsk's address space?
+ * @tsk: tracee
+ *
+ * Per-access check used by ptrace_access_vm() and architecture-specific
+ * tag/register accessors. Returns true iff current is the registered
+ * ptracer of @tsk and either @tsk is owner-dumpable or current holds
+ * CAP_SYS_PTRACE in @tsk's exec namespace. Lighter than
+ * __ptrace_may_access(): it re-validates only dumpability and
+ * capability on every access, without re-running LSM hooks or
+ * cred_cap_issubset() checks performed at attach time.
+ */
+bool ptracer_access_allowed(struct task_struct *tsk)
+{
+ const struct task_exec_state *es;
+
+ guard(rcu)();
+ if (ptrace_parent(tsk) != current)
+ return false;
+ es = task_exec_state_rcu(tsk);
+ return READ_ONCE(es->dumpable) == TASK_DUMPABLE_OWNER ||
+ ptracer_capable(tsk, es->user_ns);
+}
+
/*
* Access another process' address space via ptrace.
* Source/target buffer must be kernel space,
@@ -45,21 +70,14 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
struct mm_struct *mm;
- int ret;
+ int ret = 0;
mm = get_task_mm(tsk);
if (!mm)
return 0;
- if (!tsk->ptrace ||
- (current != tsk->parent) ||
- ((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptracer_capable(tsk, mm->user_ns))) {
- mmput(mm);
- return 0;
- }
-
- ret = access_remote_vm(mm, addr, buf, len, gup_flags);
+ if (ptracer_access_allowed(tsk))
+ ret = access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
return ret;
@@ -274,16 +292,13 @@ static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
static bool task_still_dumpable(struct task_struct *task, unsigned int mode)
{
- struct mm_struct *mm = task->mm;
- if (mm) {
- if (get_dumpable(mm) == SUID_DUMP_USER)
- return true;
- return ptrace_has_cap(mm->user_ns, mode);
- }
+ const struct task_exec_state *exec_state;
- if (task->user_dumpable)
+ guard(rcu)();
+ exec_state = task_exec_state_rcu(task);
+ if (READ_ONCE(exec_state->dumpable) == TASK_DUMPABLE_OWNER)
return true;
- return ptrace_has_cap(&init_user_ns, mode);
+ return ptrace_has_cap(exec_state->user_ns, mode);
}
/* Returns 0 on success, -errno on denial. */
diff --git a/kernel/sys.c b/kernel/sys.c
index 62e842055cc9..df69bd71de03 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2565,14 +2565,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = put_user(me->pdeath_signal, (int __user *)arg2);
break;
case PR_GET_DUMPABLE:
- error = get_dumpable(me->mm);
+ error = task_exec_state_get_dumpable(me);
break;
case PR_SET_DUMPABLE:
- if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
+ if (arg2 != TASK_DUMPABLE_OFF && arg2 != TASK_DUMPABLE_OWNER) {
error = -EINVAL;
break;
}
- set_dumpable(me->mm, arg2);
+ task_exec_state_set_dumpable(arg2);
break;
case PR_SET_UNALIGN:
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c5556bb9d5f0..3e792aad7626 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -43,7 +43,6 @@ struct mm_struct init_mm = {
.vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait),
.mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
- .user_ns = &init_user_ns,
#ifdef CONFIG_SCHED_MM_CID
.mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(init_mm.mm_cid.lock),
#endif