path: root/fs/file_table.c
author	Christian Brauner <brauner@kernel.org>	2025-03-05 20:26:07 +0300
committer	Christian Brauner <brauner@kernel.org>	2025-03-05 20:31:24 +0300
commit	dba2e3b788f5ac70ebf717523433e1ceae3f0834 (patch)
tree	d1a0540a062e74fb8b3f8907e7b1494368328206 /fs/file_table.c
parent	23e490336467fcdaf95e1efcf8f58067b59f647b (diff)
parent	606623de503f4eaa4ca505650bbd2da1f5fb5e6a (diff)
download	linux-dba2e3b788f5ac70ebf717523433e1ceae3f0834.tar.xz
Merge patch series "avoid the extra atomic on a ref when closing a fd"
Mateusz Guzik <mjguzik@gmail.com> says:

The stock kernel, when transitioning the file to no refs held, penalizes the
caller with an extra atomic to block any increments. For cases where the file
is highly likely to be going away, this is easily avoidable.

In the open+close case the win is very modest because of the following
problems:
- kmem and memcg having terrible performance
- putname using an atomic (I have a wip to whack that)
- open performing an extra ref/unref on the dentry (there are patches to do
  it, including by Al. I mailed about them in [1])
- creds using atomics (I have a wip to whack that)
- apparmor using atomics (ditto, same mechanism)

On top of that I have a WIP patch to dodge some of the work at lookup itself.
All in all, there are several % avoidably lost here.

Stats collected during a kernel build with:
bpftrace -e 'kprobe:filp_close,kprobe:fput,kprobe:fput_close* { @[probe] = hist(((struct file *)arg0)->f_ref.refcnt.counter > 0); }'

@[kprobe:filp_close]:
[0]    32195 |@@@@@@@@@@                                          |
[1]   164567 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|

@[kprobe:fput]:
[0]   339240 |@@@@@@                                              |
[1]  2888064 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|

@[kprobe:fput_close]:
[0]  5116767 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1]   164544 |@                                                    |

@[kprobe:fput_close_sync]:
[0]  5340660 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1]   358943 |@@@                                                  |

0 indicates the last reference, 1 that there are more.

filp_close is largely skewed because of close_on_exec.

The vast majority of last fputs are from remove_vma. I think that code wants
to be patched to batch them (as in something like fput_many should be added
-- something for later).

[1] https://lore.kernel.org/linux-fsdevel/20250304165728.491785-1-mjguzik@gmail.com/T/#u

* patches from https://lore.kernel.org/r/20250305123644.554845-1-mjguzik@gmail.com:
  fs: use fput_close() in path_openat()
  fs: use fput_close() in filp_close()
  fs: use fput_close_sync() in close()
  file: add fput and file_ref_put routines optimized for use when closing a fd

Link: https://lore.kernel.org/r/20250305123644.554845-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
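[Editor's note] The core of the optimization is the new file_ref_put_close() helper mentioned above: when the counter still indicates a single holder, one cmpxchg can move it straight to the "dead" state, instead of a decrement followed by a second atomic that blocks further increments. Below is a minimal user-space sketch of that idea; the names, the counter encoding, and the fallback are assumptions for illustration, not the actual include/linux/file_ref.h implementation.

/*
 * Hedged sketch (not the kernel's file_ref code): why a close-optimized
 * put saves an atomic when the caller very likely holds the last
 * reference.  Assumed encoding: refcnt == 0 means "one reference held",
 * REF_DEAD is a sentinel that blocks further gets.
 */
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

#define REF_ONEREF	0L
#define REF_DEAD	(LONG_MIN / 2)	/* no new references allowed */

struct sketch_ref { _Atomic long refcnt; };

/*
 * Plain put: decrement, and on hitting the last reference pay a second
 * atomic to mark the ref dead so concurrent lookups cannot revive it.
 */
static bool sketch_ref_put(struct sketch_ref *ref)
{
	long cnt = atomic_fetch_sub(&ref->refcnt, 1) - 1;

	if (cnt >= 0)
		return false;			/* other references remain */
	atomic_store(&ref->refcnt, REF_DEAD);	/* the "extra atomic" */
	return true;
}

/*
 * Close-optimized put: while the counter still reads "one reference",
 * a single cmpxchg takes it straight to REF_DEAD; otherwise fall back
 * to the plain put path.
 */
static bool sketch_ref_put_close(struct sketch_ref *ref)
{
	long old = REF_ONEREF;

	if (atomic_compare_exchange_strong(&ref->refcnt, &old, REF_DEAD))
		return true;			/* dropped the last reference */
	return sketch_ref_put(ref);
}

Per the patch titles above, only callers that are overwhelmingly likely to hold the last reference (close(), filp_close(), the fput in path_openat()) are switched to the close-optimized helpers; everything else keeps using plain fput().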
Diffstat (limited to 'fs/file_table.c')
-rw-r--r--	fs/file_table.c	| 70
1 file changed, 49 insertions(+), 21 deletions(-)
diff --git a/fs/file_table.c b/fs/file_table.c
index f0291a66f9db..18debb7bd285 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -495,31 +495,37 @@ void flush_delayed_fput(void)
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);
-void fput(struct file *file)
+static void __fput_deferred(struct file *file)
{
- if (file_ref_put(&file->f_ref)) {
- struct task_struct *task = current;
+ struct task_struct *task = current;
+
+ if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
+ file_free(file);
+ return;
+ }
- if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
- file_free(file);
+ if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+ init_task_work(&file->f_task_work, ____fput);
+ if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
return;
- }
- if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
- init_task_work(&file->f_task_work, ____fput);
- if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
- return;
- /*
- * After this task has run exit_task_work(),
- * task_work_add() will fail. Fall through to delayed
- * fput to avoid leaking *file.
- */
- }
-
- if (llist_add(&file->f_llist, &delayed_fput_list))
- schedule_delayed_work(&delayed_fput_work, 1);
+ /*
+ * After this task has run exit_task_work(),
+ * task_work_add() will fail. Fall through to delayed
+ * fput to avoid leaking *file.
+ */
}
+
+ if (llist_add(&file->f_llist, &delayed_fput_list))
+ schedule_delayed_work(&delayed_fput_work, 1);
}
+void fput(struct file *file)
+{
+ if (unlikely(file_ref_put(&file->f_ref)))
+ __fput_deferred(file);
+}
+EXPORT_SYMBOL(fput);
+
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
@@ -533,10 +539,32 @@ void __fput_sync(struct file *file)
if (file_ref_put(&file->f_ref))
__fput(file);
}
-
-EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);
+/*
+ * Equivalent to __fput_sync(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close_sync(struct file *file)
+{
+ if (likely(file_ref_put_close(&file->f_ref)))
+ __fput(file);
+}
+
+/*
+ * Equivalent to fput(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close(struct file *file)
+{
+ if (file_ref_put_close(&file->f_ref))
+ __fput_deferred(file);
+}
+
void __init files_init(void)
{
struct kmem_cache_args args = {