exec: free the old mm outside the exec locks

exec_mmap() installs the new mm and then tears the old one down while still holding exec_update_lock for writing -- and with cred_guard_mutex held all the way to setup_new_exec(): setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); Neither lock is needed for this. exec_update_lock only exists to make the mm swap atomic with the later commit_creds(), so that permission-checking readers (proc, ptrace, the futex robust list, perf, kcmp, mm_access()) never observe the new mm together with the old credentials. Those readers all operate on task->mm, i.e. the new mm after the swap; none looks at the detached old mm, its ->owner or signal->maxrss. cred_guard_mutex guards credential calculation and is equally irrelevant here. The cost is real: __mmput() runs exit_mmap() over the entire old address space and can block in exit_aio() waiting for in-flight AIO, all while holding exec_update_lock for writing and cred_guard_mutex. For execve() of a large process this blocks ptrace_attach() and every exec_update_lock reader for the duration of the teardown. Stash the old mm in bprm->old_mm and release it from setup_new_exec() after both locks are dropped. setup_new_exec() still runs before setup_arg_pages() and the segment mappings, so the old address space is freed before the new one is populated and peak memory is unchanged. The ordering constraints are kept: old_mm's mmap_lock is still dropped in exec_mmap() before mm_update_next_owner() (required since commit 31a78f23bac0 ("mm owner: fix race between swapoff and exit")), and mm_update_next_owner() still precedes mmput(); both run in the execing task's context, as mm_update_next_owner() requires. If exec swaps the mm but fails before setup_new_exec() runs the old mm would leak, so add a backstop in free_bprm(). The lazy-tlb case (old_mm == NULL, e.g. kernel_execve()) has no address space to free and is left in exec_mmap(). Link: https://patch.msgid.link/20260522-work-exit_mm-v1-1-bd32d5a560bb@kernel.org Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
author: Christian Brauner <brauner@kernel.org> 2026-05-22 17:03:30 +0300
committer: Christian Brauner <brauner@kernel.org> 2026-05-26 12:02:02 +0300
commit: 38205ecbe6b6dc47968ad4e9c978e2117720969e (patch)
tree: c41998df5cb1af4d6965558bcbc732bd582f721c
parent: 4425cd76b5e73ce92bea9dc61a0027ef3d55c9f0 (diff)
download: linux-38205ecbe6b6dc47968ad4e9c978e2117720969e.tar.xz
2 files changed, 26 insertions, 7 deletions
diff --git a/fs/exec.c b/fs/exec.c
index 9e7f25e2cd41..824b46c069ae 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -836,16 +836,18 @@ EXPORT_SYMBOL(read_code);
 /*
  * Maps the mm_struct mm into the current task struct.
  * On success, this function returns with exec_update_lock
- * held for writing.
+ * held for writing. The replaced address space is stashed in
+ * bprm->old_mm for setup_new_exec() to release outside the lock.
  */
-static int exec_mmap(struct mm_struct *mm, struct user_namespace *user_ns)
+static int exec_mmap(struct linux_binprm *bprm)
 {
 	struct task_exec_state *exec_state __free(put_task_exec_state) = NULL;
+	struct mm_struct *mm = bprm->mm;
 	struct task_struct *tsk;
 	struct mm_struct *old_mm, *active_mm;
 	int ret;
 
-	exec_state = alloc_task_exec_state(user_ns);
+	exec_state = alloc_task_exec_state(bprm->user_ns);
 	if (!exec_state)
 		return -ENOMEM;
 
@@ -898,15 +900,22 @@ static int exec_mmap(struct mm_struct *mm, struct user_namespace *user_ns)
 	if (old_mm) {
 		mmap_read_unlock(old_mm);
 		BUG_ON(active_mm != old_mm);
-		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
-		mm_update_next_owner(old_mm);
-		mmput(old_mm);
+		/* Defer teardown to setup_new_exec(), outside the exec locks. */
+		bprm->old_mm = old_mm;
 		return 0;
 	}
 	mmdrop_lazy_tlb(active_mm);
 	return 0;
 }
 
+/* Release the address space replaced by exec, outside the exec locks. */
+static void exec_mm_put_old(struct mm_struct *old_mm)
+{
+	setmax_mm_hiwater_rss(&current->signal->maxrss, old_mm);
+	mm_update_next_owner(old_mm);
+	mmput(old_mm);
+}
+
 static int de_thread(struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
@@ -1155,7 +1164,7 @@ int begin_new_exec(struct linux_binprm * bprm)
 	 * Release all of the old mmap stuff
 	 */
 	acct_arg_size(bprm, 0);
-	retval = exec_mmap(bprm->mm, bprm->user_ns);
+	retval = exec_mmap(bprm);
 	if (retval)
 		goto out;
 
@@ -1338,6 +1347,12 @@ void setup_new_exec(struct linux_binprm * bprm)
 	me->mm->task_size = TASK_SIZE;
 	up_write(&me->signal->exec_update_lock);
 	mutex_unlock(&me->signal->cred_guard_mutex);
+
+	/* The exec locks are dropped: release the old address space now. */
+	if (bprm->old_mm) {
+		exec_mm_put_old(bprm->old_mm);
+		bprm->old_mm = NULL;
+	}
 }
 EXPORT_SYMBOL(setup_new_exec);
 
@@ -1394,6 +1409,9 @@ static void free_bprm(struct linux_binprm *bprm)
 		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
+	/* exec swapped the mm but failed before setup_new_exec() freed it */
+	if (bprm->old_mm)
+		exec_mm_put_old(bprm->old_mm);
 	do_close_execat(bprm->file);
 	if (bprm->executable)
 		fput(bprm->executable);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index a8379f4eee61..2c77e383e737 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -25,6 +25,7 @@ struct linux_binprm {
 	struct page *page[MAX_ARG_PAGES];
 #endif
 	struct mm_struct *mm;
+	struct mm_struct *old_mm;	/* replaced address space, freed by setup_new_exec() */
 	/* user_ns published to task->exec_state at execve, narrowed by would_dump(). */
 	struct user_namespace *user_ns;
 	unsigned long p; /* current top of mem */
author	Christian Brauner <brauner@kernel.org>	2026-05-22 17:03:30 +0300
committer	Christian Brauner <brauner@kernel.org>	2026-05-26 12:02:02 +0300
commit	38205ecbe6b6dc47968ad4e9c978e2117720969e (patch)
tree	c41998df5cb1af4d6965558bcbc732bd582f721c
parent	4425cd76b5e73ce92bea9dc61a0027ef3d55c9f0 (diff)
download	linux-38205ecbe6b6dc47968ad4e9c978e2117720969e.tar.xz