author | Christian Brauner <brauner@kernel.org> | 2025-03-05 20:26:07 +0300
---|---|---
committer | Christian Brauner <brauner@kernel.org> | 2025-03-05 20:31:24 +0300
commit | dba2e3b788f5ac70ebf717523433e1ceae3f0834 (patch) |
tree | d1a0540a062e74fb8b3f8907e7b1494368328206 |
parent | 23e490336467fcdaf95e1efcf8f58067b59f647b (diff) |
parent | 606623de503f4eaa4ca505650bbd2da1f5fb5e6a (diff) |
download | linux-dba2e3b788f5ac70ebf717523433e1ceae3f0834.tar.xz |
Merge patch series "avoid the extra atomic on a ref when closing a fd"
Mateusz Guzik <mjguzik@gmail.com> says:
In the stock kernel, transitioning a file to the no-refs-held state penalizes
the caller with an extra atomic operation to block any further increments.
For cases where the file is highly likely to be going away, this is
easily avoidable.
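To make the mechanism concrete, here is a minimal userspace C sketch of the two schemes. This is not the kernel code itself (the real implementation is file_ref_put_close() in the diff below, and the kernel's file_ref_t additionally uses a biased counter); REF_DEAD, ref_put_twostep() and ref_put_close() are illustrative names:

```c
#include <stdatomic.h>
#include <stdbool.h>

#define REF_DEAD (-1L)	/* stand-in for the kernel's FILE_REF_DEAD sentinel */

/* Stock-style put: one atomic to drop the count and, on the last
 * reference, a second atomic to park the counter in the dead zone
 * so that concurrent increments are blocked. */
static bool ref_put_twostep(atomic_long *refcnt)
{
	if (atomic_fetch_sub(refcnt, 1) == 1) {
		atomic_store(refcnt, REF_DEAD);	/* the extra atomic */
		return true;	/* caller tears the object down */
	}
	return false;
}

/* Close-optimized put: pre-read the counter and use a single cmpxchg
 * to go straight from "one reference" to dead. The pre-read hurts
 * scalability when other CPUs are also touching the refcount, but it
 * saves an atomic in the expected last-reference case. */
static bool ref_put_close(atomic_long *refcnt)
{
	long old = atomic_load(refcnt);
	long new;

	do {
		new = (old == 1) ? REF_DEAD : old - 1;
	} while (!atomic_compare_exchange_weak(refcnt, &old, new));

	return new == REF_DEAD;
}
```

This mirrors the trade-off spelled out in the file_ref_put_close() comment in the diff below: default paths stick to the two-step file_ref_put(), while close() and friends, where the last reference is overwhelmingly likely, use the combined variant.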
In the open+close case the win is very modest because of the following
problems:
- kmem and memcg having terrible performance
- putname using an atomic (I have a WIP to whack that)
- open performing an extra ref/unref on the dentry (there are patches to
fix it, including some by Al; I mailed about them in [1])
- creds using atomics (I have a WIP to whack that)
- apparmor using atomics (ditto, same mechanism)
On top of that I have a WIP patch to dodge some of the work at lookup
itself.
All in all, several % of performance are avoidably lost here.
Stats collected during a kernel build with:
bpftrace -e 'kprobe:filp_close,kprobe:fput,kprobe:fput_close* { @[probe] = hist(((struct file *)arg0)->f_ref.refcnt.counter > 0); }'
@[kprobe:filp_close]:
[0] 32195 |@@@@@@@@@@ |
[1] 164567 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
@[kprobe:fput]:
[0] 339240 |@@@@@@ |
[1] 2888064 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
@[kprobe:fput_close]:
[0] 5116767 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1] 164544 |@ |
@[kprobe:fput_close_sync]:
[0] 5340660 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1] 358943 |@@@ |
0 indicates the last reference; 1 indicates that more references remain.
filp_close is largely skewed because of close_on_exec.
The vast majority of last fputs come from remove_vma(). I think that code wants
to be patched to batch them (as in, something like fput_many() should be
added; something for later, see the hypothetical sketch below).
[1] https://lore.kernel.org/linux-fsdevel/20250304165728.491785-1-mjguzik@gmail.com/T/#u
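An fput_many() of roughly this shape existed in older kernels, before the file_ref_t conversion. As a hypothetical sketch only (assuming, for simplicity, the old layout where f_count is a plain atomic_long_t, and reusing the __fput_deferred() helper this series introduces), it could look like:

```c
/*
 * Hypothetical sketch, not part of this series: drop @refs references
 * with a single atomic and take the usual deferred-fput path only when
 * the last one goes away. Assumes the pre-file_ref_t representation
 * where f_count is a plain atomic_long_t.
 */
void fput_many(struct file *file, unsigned int refs)
{
	if (atomic_long_sub_and_test(refs, &file->f_count))
		__fput_deferred(file);
}
```

A remove_vma() caller unmapping many vmas backed by the same file could then drop all of its references with one atomic per file instead of one per vma.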
* patches from https://lore.kernel.org/r/20250305123644.554845-1-mjguzik@gmail.com:
fs: use fput_close() in path_openat()
fs: use fput_close() in filp_close()
fs: use fput_close_sync() in close()
file: add fput and file_ref_put routines optimized for use when closing a fd
Link: https://lore.kernel.org/r/20250305123644.554845-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
-rw-r--r-- | fs/file.c | 41
-rw-r--r-- | fs/file_table.c | 70
-rw-r--r-- | fs/internal.h | 3
-rw-r--r-- | fs/namei.c | 2
-rw-r--r-- | fs/open.c | 4
-rw-r--r-- | include/linux/file_ref.h | 34
6 files changed, 112 insertions, 42 deletions
diff --git a/fs/file.c b/fs/file.c
index d868cdb95d1e..3fef798b96e5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -26,6 +26,28 @@
 
 #include "internal.h"
 
+bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
+{
+	/*
+	 * If the reference count was already in the dead zone, then this
+	 * put() operation is imbalanced. Warn, put the reference count back to
+	 * DEAD and tell the caller to not deconstruct the object.
+	 */
+	if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+		atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+		return false;
+	}
+
+	/*
+	 * This is a put() operation on a saturated refcount. Restore the
+	 * mean saturation value and tell the caller to not deconstruct the
+	 * object.
+	 */
+	if (cnt > FILE_REF_MAXREF)
+		atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+	return false;
+}
+
 /**
  * __file_ref_put - Slowpath of file_ref_put()
  * @ref:	Pointer to the reference count
@@ -67,24 +89,7 @@ bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
 		return true;
 	}
 
-	/*
-	 * If the reference count was already in the dead zone, then this
-	 * put() operation is imbalanced. Warn, put the reference count back to
-	 * DEAD and tell the caller to not deconstruct the object.
-	 */
-	if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
-		atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
-		return false;
-	}
-
-	/*
-	 * This is a put() operation on a saturated refcount. Restore the
-	 * mean saturation value and tell the caller to not deconstruct the
-	 * object.
-	 */
-	if (cnt > FILE_REF_MAXREF)
-		atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
-	return false;
+	return __file_ref_put_badval(ref, cnt);
 }
 EXPORT_SYMBOL_GPL(__file_ref_put);
 
diff --git a/fs/file_table.c b/fs/file_table.c
index f0291a66f9db..18debb7bd285 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -495,31 +495,37 @@ void flush_delayed_fput(void)
 }
 EXPORT_SYMBOL_GPL(flush_delayed_fput);
 
-void fput(struct file *file)
+static void __fput_deferred(struct file *file)
 {
-	if (file_ref_put(&file->f_ref)) {
-		struct task_struct *task = current;
+	struct task_struct *task = current;
+
+	if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
+		file_free(file);
+		return;
+	}
 
-		if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
-			file_free(file);
+	if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+		init_task_work(&file->f_task_work, ____fput);
+		if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
 			return;
-		}
-		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
-			init_task_work(&file->f_task_work, ____fput);
-			if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
-				return;
-			/*
-			 * After this task has run exit_task_work(),
-			 * task_work_add() will fail. Fall through to delayed
-			 * fput to avoid leaking *file.
-			 */
-		}
-
-		if (llist_add(&file->f_llist, &delayed_fput_list))
-			schedule_delayed_work(&delayed_fput_work, 1);
+		/*
+		 * After this task has run exit_task_work(),
+		 * task_work_add() will fail. Fall through to delayed
+		 * fput to avoid leaking *file.
+		 */
 	}
+
+	if (llist_add(&file->f_llist, &delayed_fput_list))
+		schedule_delayed_work(&delayed_fput_work, 1);
 }
 
+void fput(struct file *file)
+{
+	if (unlikely(file_ref_put(&file->f_ref)))
+		__fput_deferred(file);
+}
+EXPORT_SYMBOL(fput);
+
 /*
  * synchronous analog of fput(); for kernel threads that might be needed
  * in some umount() (and thus can't use flush_delayed_fput() without
@@ -533,10 +539,32 @@ void __fput_sync(struct file *file)
 	if (file_ref_put(&file->f_ref))
 		__fput(file);
 }
-
-EXPORT_SYMBOL(fput);
 EXPORT_SYMBOL(__fput_sync);
 
+/*
+ * Equivalent to __fput_sync(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close_sync(struct file *file)
+{
+	if (likely(file_ref_put_close(&file->f_ref)))
+		__fput(file);
+}
+
+/*
+ * Equivalent to fput(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close(struct file *file)
+{
+	if (file_ref_put_close(&file->f_ref))
+		__fput_deferred(file);
+}
+
 void __init files_init(void)
 {
 	struct kmem_cache_args args = {
diff --git a/fs/internal.h b/fs/internal.h
index e7f02ae1e098..05c817f39a28 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -118,6 +118,9 @@ static inline void put_file_access(struct file *file)
 	}
 }
 
+void fput_close_sync(struct file *);
+void fput_close(struct file *);
+
 /*
  * super.c
  */
diff --git a/fs/namei.c b/fs/namei.c
index 3ab9440c5b93..355a37097769 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3995,7 +3995,7 @@ static struct file *path_openat(struct nameidata *nd,
 			WARN_ON(1);
 		error = -EINVAL;
 	}
-	fput(file);
+	fput_close(file);
 	if (error == -EOPENSTALE) {
 		if (flags & LOOKUP_RCU)
 			error = -ECHILD;
diff --git a/fs/open.c b/fs/open.c
index a8a5f843e3cf..39a89df7c37e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1550,7 +1550,7 @@ int filp_close(struct file *filp, fl_owner_t id)
 	int retval;
 
 	retval = filp_flush(filp, id);
-	fput(filp);
+	fput_close(filp);
 
 	return retval;
 }
@@ -1576,7 +1576,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
 	 * We're returning to user space. Don't bother
 	 * with any delayed fput() cases.
 	 */
-	__fput_sync(file);
+	fput_close_sync(file);
 
 	if (likely(retval == 0))
 		return 0;
diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h
index 9b3a8d9b17ab..6ef92d765a66 100644
--- a/include/linux/file_ref.h
+++ b/include/linux/file_ref.h
@@ -61,6 +61,7 @@ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
 	atomic_long_set(&ref->refcnt, cnt - 1);
 }
 
+bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt);
 bool __file_ref_put(file_ref_t *ref, unsigned long cnt);
 
 /**
@@ -161,6 +162,39 @@ static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
 }
 
 /**
+ * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF
+ * @ref:	Pointer to the reference count
+ *
+ * Semantically it is equivalent to calling file_ref_put(), but it trades lower
+ * performance in face of other CPUs also modifying the refcount for higher
+ * performance when this happens to be the last reference.
+ *
+ * For the last reference file_ref_put() issues 2 atomics. One to drop the
+ * reference and another to transition it to FILE_REF_DEAD. This routine does
+ * the work in one step, but in order to do it has to pre-read the variable which
+ * decreases scalability.
+ *
+ * Use with close() et al, stick to file_ref_put() by default.
+ */
+static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref)
+{
+	long old, new;
+
+	old = atomic_long_read(&ref->refcnt);
+	do {
+		if (unlikely(old < 0))
+			return __file_ref_put_badval(ref, old);
+
+		if (old == FILE_REF_ONEREF)
+			new = FILE_REF_DEAD;
+		else
+			new = old - 1;
+	} while (!atomic_long_try_cmpxchg(&ref->refcnt, &old, new));
+
+	return new == FILE_REF_DEAD;
+}
+
+/**
  * file_ref_read - Read the number of file references
  * @ref:	Pointer to the reference count
  *