diff options
| author | Christian Brauner <brauner@kernel.org> | 2026-04-27 17:49:01 +0300 |
|---|---|---|
| committer | Christian Brauner <brauner@kernel.org> | 2026-05-12 00:13:01 +0300 |
| commit | e75c21d5ad82def93bc77e9aa41c2212964a8d2f (patch) | |
| tree | d4b6f54356baa327dcd14355520d91ed94d1fa76 | |
| parent | 254f49634ee16a731174d2ae34bc50bd5f45e731 (diff) | |
| parent | 36b3306779ea58f994a614b3663a196707a66ce2 (diff) | |
| download | linux-e75c21d5ad82def93bc77e9aa41c2212964a8d2f.tar.xz | |
Merge patch series "revamp fs/filesystems.c"
Mateusz Guzik <mjguzik@gmail.com> says:
The file is a mess with a hand-rolled linked list in a desperate need of
a clean up.
The code to emit /proc/filesystems is used frequently because libselinux
reads the file, which in turn is linked into numerous frequently used
programs (even ones you would not suspect, like sed!). In order to
combat that pre-gen the string instead of pointer-chasing and printfing
one by-one.
open+read+close cycle single-threaded (ops/s):
before: 442732
after: 1063462 (+140%)
Additionally scalability is also improved thanks to bypassing ref
maintenance on open/close.
open+read+close cycle with 20 processes (ops/s):
before: 606177
after: 3300576 (+444%)
The main bottleneck afterwards is the spurious lockref trip on open.
* patches from https://patch.msgid.link/20260425220844.1763933-1-mjguzik@gmail.com:
fs: cache the string generated by reading /proc/filesystems
fs: RCU-ify filesystems list
proc: allow to mark /proc files permanent outside of fs/proc/
Link: https://patch.msgid.link/20260425220844.1763933-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
| -rw-r--r-- | fs/filesystems.c | 332 | ||||
| -rw-r--r-- | fs/ocfs2/super.c | 1 | ||||
| -rw-r--r-- | fs/proc/generic.c | 10 | ||||
| -rw-r--r-- | fs/proc/internal.h | 3 | ||||
| -rw-r--r-- | include/linux/fs.h | 2 | ||||
| -rw-r--r-- | include/linux/proc_fs.h | 12 |
6 files changed, 252 insertions, 108 deletions
diff --git a/fs/filesystems.c b/fs/filesystems.c index 0c7d2b7ac26c..771fc31a69b8 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -17,22 +17,49 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> +#include <linux/rculist.h> /* - * Handling of filesystem drivers list. - * Rules: - * Inclusion to/removals from/scanning of list are protected by spinlock. - * During the unload module must call unregister_filesystem(). - * We can access the fields of list element if: - * 1) spinlock is held or - * 2) we hold the reference to the module. - * The latter can be guaranteed by call of try_module_get(); if it - * returned 0 we must skip the element, otherwise we got the reference. - * Once the reference is obtained we can drop the spinlock. + * Read-mostly filesystem drivers list. + * + * Readers walk under rcu_read_lock(); writers take file_systems_lock + * and publish via _rcu hlist primitives. unregister_filesystem() + * synchronize_rcu()s after unlock so the embedded file_system_type + * can't go away under a reader. To keep using a filesystem after + * the RCU section ends, take a module reference via try_module_get(). + */ +static HLIST_HEAD(file_systems); +static DEFINE_SPINLOCK(file_systems_lock); + +#ifdef CONFIG_PROC_FS +/* + * Cache a stringified version of the filesystem list. + * + * The fs list gets queried a lot by userspace because of libselinux, including + * rather surprising programs (would you guess *sed* is on the list?). In order + * to reduce the overhead we cache the resulting string, which normally hangs + * around below 512 bytes in size. + * + * As the list almost never changes, its creation is not particularly optimized + * to keep things simple. + * + * We sort it out on read in order to not introduce a failure point for fs + * registration (in principle we may be unable to alloc memory for the list). */ +struct file_systems_string { + struct rcu_head rcu; + unsigned long gen; + size_t len; + char string[]; +}; + +static unsigned long file_systems_gen; +static struct file_systems_string __rcu *file_systems_string; -static struct file_system_type *file_systems; -static DEFINE_RWLOCK(file_systems_lock); +static void invalidate_filesystems_string(void); +#else +static inline void invalidate_filesystems_string(void) { } +#endif /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) @@ -46,14 +73,15 @@ void put_filesystem(struct file_system_type *fs) module_put(fs->owner); } -static struct file_system_type **find_filesystem(const char *name, unsigned len) +static struct file_system_type *find_filesystem(const char *name, unsigned len) { - struct file_system_type **p; - for (p = &file_systems; *p; p = &(*p)->next) - if (strncmp((*p)->name, name, len) == 0 && - !(*p)->name[len]) - break; - return p; + struct file_system_type *fs; + + hlist_for_each_entry_rcu(fs, &file_systems, list, + lockdep_is_held(&file_systems_lock)) + if (strncmp(fs->name, name, len) == 0 && !fs->name[len]) + return fs; + return NULL; } /** @@ -64,33 +92,27 @@ static struct file_system_type **find_filesystem(const char *name, unsigned len) * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * - * The &struct file_system_type that is passed is linked into the kernel + * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ - -int register_filesystem(struct file_system_type * fs) +int register_filesystem(struct file_system_type *fs) { - int res = 0; - struct file_system_type ** p; - if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); - if (fs->next) + if (!hlist_unhashed_lockless(&fs->list)) return -EBUSY; - write_lock(&file_systems_lock); - p = find_filesystem(fs->name, strlen(fs->name)); - if (*p) - res = -EBUSY; - else - *p = fs; - write_unlock(&file_systems_lock); - return res; -} + guard(spinlock)(&file_systems_lock); + if (find_filesystem(fs->name, strlen(fs->name))) + return -EBUSY; + hlist_add_tail_rcu(&fs->list, &file_systems); + invalidate_filesystems_string(); + return 0; +} EXPORT_SYMBOL(register_filesystem); /** @@ -100,94 +122,79 @@ EXPORT_SYMBOL(register_filesystem); * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. - * + * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ - -int unregister_filesystem(struct file_system_type * fs) +int unregister_filesystem(struct file_system_type *fs) { - struct file_system_type ** tmp; - - write_lock(&file_systems_lock); - tmp = &file_systems; - while (*tmp) { - if (fs == *tmp) { - *tmp = fs->next; - fs->next = NULL; - write_unlock(&file_systems_lock); - synchronize_rcu(); - return 0; - } - tmp = &(*tmp)->next; + scoped_guard(spinlock, &file_systems_lock) { + if (hlist_unhashed(&fs->list)) + return -EINVAL; + hlist_del_init_rcu(&fs->list); + invalidate_filesystems_string(); } - write_unlock(&file_systems_lock); - - return -EINVAL; + synchronize_rcu(); + return 0; } - EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL -static int fs_index(const char __user * __name) +static int fs_index(const char __user *__name) { - struct file_system_type * tmp; + struct file_system_type *p; char *name __free(kfree) = strndup_user(__name, PATH_MAX); - int err, index; + int index = 0; if (IS_ERR(name)) return PTR_ERR(name); - err = -EINVAL; - read_lock(&file_systems_lock); - for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { - if (strcmp(tmp->name, name) == 0) { - err = index; - break; - } + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (strcmp(p->name, name) == 0) + return index; + index++; } - read_unlock(&file_systems_lock); - return err; + return -EINVAL; } -static int fs_name(unsigned int index, char __user * buf) +static int fs_name(unsigned int index, char __user *buf) { - struct file_system_type * tmp; - int len, res = -EINVAL; - - read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) { - if (index == 0) { - if (try_module_get(tmp->owner)) - res = 0; + struct file_system_type *p, *found = NULL; + int len, res; + + scoped_guard(rcu) { + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (index--) + continue; + if (try_module_get(p->owner)) + found = p; break; } } - read_unlock(&file_systems_lock); - if (res) - return res; + if (!found) + return -EINVAL; /* OK, we got the reference, so we can safely block */ - len = strlen(tmp->name) + 1; - res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; - put_filesystem(tmp); + len = strlen(found->name) + 1; + res = copy_to_user(buf, found->name, len) ? -EFAULT : 0; + put_filesystem(found); return res; } static int fs_maxindex(void) { - struct file_system_type * tmp; - int index; + struct file_system_type *p; + int index = 0; - read_lock(&file_systems_lock); - for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) - ; - read_unlock(&file_systems_lock); + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) + index++; return index; } /* - * Whee.. Weird sysv syscall. + * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { @@ -216,8 +223,8 @@ int __init list_bdev_fs_names(char *buf, size_t size) size_t len; int count = 0; - read_lock(&file_systems_lock); - for (p = file_systems; p; p = p->next) { + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; @@ -230,30 +237,145 @@ int __init list_bdev_fs_names(char *buf, size_t size) size -= len; count++; } - read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS -static int filesystems_proc_show(struct seq_file *m, void *v) +static void invalidate_filesystems_string(void) +{ + struct file_systems_string *old; + + lockdep_assert_held_write(&file_systems_lock); + file_systems_gen++; + old = rcu_replace_pointer(file_systems_string, NULL, + lockdep_is_held(&file_systems_lock)); + if (old) + kfree_rcu(old, rcu); +} + +static __cold noinline int regen_filesystems_string(void) +{ + struct file_system_type *p; + struct file_systems_string *old, *new; + size_t newlen, usedlen; + unsigned long gen; + +retry: + newlen = 0; + + /* pre-calc space for each fs */ + spin_lock(&file_systems_lock); + gen = file_systems_gen; + hlist_for_each_entry_rcu(p, &file_systems, list) { + if (!(p->fs_flags & FS_REQUIRES_DEV)) + newlen += strlen("nodev"); + newlen += strlen("\t") + strlen(p->name) + strlen("\n"); + } + spin_unlock(&file_systems_lock); + + new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1, + GFP_KERNEL); + if (!new) + return -ENOMEM; + + new->gen = gen; + new->len = newlen; + new->string[newlen] = '\0'; + + spin_lock(&file_systems_lock); + old = file_systems_string; + + /* + * Did someone beat us to it? + */ + if (old && old->gen == file_systems_gen) { + kfree(new); + return 0; + } + + /* + * Did the list change in the meantime? + */ + if (gen != file_systems_gen) { + kfree(new); + goto retry; + } + + /* + * Populate the string. + * + * We know we have just enough space because we calculated the right + * size the previous time we had the lock and confirmed the list has + * not changed after reacquiring it. + */ + usedlen = 0; + hlist_for_each_entry_rcu(p, &file_systems, list) { + usedlen += sprintf(&new->string[usedlen], "%s\t%s\n", + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); + } + + if (WARN_ON_ONCE(new->len != strlen(new->string))) { + /* + * Should never happen of course, keep this in case someone changes string + * generation above and messes it up. + */ + spin_unlock(&file_systems_lock); + if (old) + kfree_rcu(old, rcu); + return -EINVAL; + } + + /* + * Paired with consume fence in READ_ONCE() in filesystems_proc_show() + */ + smp_store_release(&file_systems_string, new); + spin_unlock(&file_systems_lock); + if (old) + kfree_rcu(old, rcu); + return 0; +} + +static __cold noinline int filesystems_proc_show_fallback(struct seq_file *m, void *v) { - struct file_system_type * tmp; + struct file_system_type *p; - read_lock(&file_systems_lock); - tmp = file_systems; - while (tmp) { + guard(rcu)(); + hlist_for_each_entry_rcu(p, &file_systems, list) { seq_printf(m, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", - tmp->name); - tmp = tmp->next; + (p->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", + p->name); } - read_unlock(&file_systems_lock); return 0; } +static int filesystems_proc_show(struct seq_file *m, void *v) +{ + struct file_systems_string *fss; + + for (;;) { + scoped_guard(rcu) { + fss = rcu_dereference(file_systems_string); + if (likely(fss)) { + seq_write(m, fss->string, fss->len); + return 0; + } + } + + int err = regen_filesystems_string(); + if (unlikely(err)) + return filesystems_proc_show_fallback(m, v); + } +} + static int __init proc_filesystems_init(void) { - proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("filesystems", 0, NULL, filesystems_proc_show); + if (!pde) + return -ENOMEM; + proc_make_permanent(pde); return 0; } module_init(proc_filesystems_init); @@ -263,11 +385,10 @@ static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; - read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); + guard(rcu)(); + fs = find_filesystem(name, len); if (fs && !try_module_get(fs->owner)) fs = NULL; - read_unlock(&file_systems_lock); return fs; } @@ -291,5 +412,4 @@ struct file_system_type *get_fs_type(const char *name) } return fs; } - EXPORT_SYMBOL(get_fs_type); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b875f01c9756..4870e680c4e5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1224,7 +1224,6 @@ static struct file_system_type ocfs2_fs_type = { .name = "ocfs2", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, - .next = NULL, .init_fs_context = ocfs2_init_fs_context, .parameters = ocfs2_param_spec, }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8bb81e58c9d8..c6ae076e1fa0 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -841,3 +841,13 @@ ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, kfree(buf); return ret == 0 ? size : ret; } + +/* + * Not exported to modules: + * modules' /proc files aren't permanent because modules aren't permanent. + */ +void impl_proc_make_permanent(struct proc_dir_entry *pde) +{ + if (pde) + pde_make_permanent(pde); +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 64dc44832808..1edbabbdbc5d 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,8 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +/* This is for builtin code, not even for modules which are compiled in. */ static inline void pde_make_permanent(struct proc_dir_entry *pde) { + /* Ensure magic flag does something. */ + static_assert(PROC_ENTRY_PERMANENT != 0); pde->flags |= PROC_ENTRY_PERMANENT; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 11559c513dfb..c37bb3c7de8b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2286,7 +2286,7 @@ struct file_system_type { const struct fs_parameter_spec *parameters; void (*kill_sb) (struct super_block *); struct module *owner; - struct file_system_type * next; + struct hlist_node list; struct hlist_head fs_supers; struct lock_class_key s_lock_key; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 19d1c5e5f335..d2860c18dca9 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -248,4 +248,16 @@ static inline struct pid_namespace *proc_pid_ns(struct super_block *sb) bool proc_ns_file(const struct file *file); +#if defined CONFIG_PROC_FS && !defined MODULE +void impl_proc_make_permanent(struct proc_dir_entry *pde); +#endif + +static inline void proc_make_permanent(struct proc_dir_entry *pde) +{ + /* Don't give matches to modules. */ +#if defined CONFIG_PROC_FS && !defined MODULE + impl_proc_make_permanent(pde); +#endif +} + #endif /* _LINUX_PROC_FS_H */ |
