From d3ce197903279c5497d4fa9cc97b519400dc4a44 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Jul 2021 17:15:59 +0200 Subject: sysfs: Use local reference in compat_only_sysfs_link_entry_to_kobj() As we have just obtained target_kobj->sd into a local variable, and incremented the object's reference count, it is better to use the local variable instead of the original reference. Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20210714151559.2532572-1-geert@linux-m68k.org Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 64e6a6698935..f29d62004527 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -446,7 +446,7 @@ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, if (!target) return -ENOENT; - entry = kernfs_find_and_get(target_kobj->sd, target_name); + entry = kernfs_find_and_get(target, target_name); if (!entry) { kernfs_put(target); return -ENOENT; -- cgit v1.2.3 From 895adbec302e92086359e6fd92611ac3be6d92c3 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 16 Jul 2021 17:28:18 +0800 Subject: kernfs: add a revision to identify directory node changes Add a revision counter to kernfs directory nodes so it can be used to detect if a directory node has changed during negative dentry revalidation. There's an assumption that sizeof(unsigned long) <= sizeof(pointer) on all architectures and as far as I know that assumption holds. So adding a revision counter to the struct kernfs_elem_dir variant of the kernfs_node type union won't increase the size of the kernfs_node struct. This is because struct kernfs_elem_dir is at least sizeof(pointer) smaller than the largest union variant. It's tempting to make the revision counter a u64 but that would increase the size of kernfs_node on archs where sizeof(pointer) is smaller than the revision counter. Reviewed-by: Miklos Szeredi Signed-off-by: Ian Kent Link: https://lore.kernel.org/r/162642769895.63632.8356662784964509867.stgit@web.messagingengine.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 2 ++ fs/kernfs/kernfs-internal.h | 19 +++++++++++++++++++ include/linux/kernfs.h | 5 +++++ 3 files changed, 26 insertions(+) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 33166ec90a11..b3d1bc0f317d 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -372,6 +372,7 @@ static int kernfs_link_sibling(struct kernfs_node *kn) /* successfully added, account subdir number */ if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; + kernfs_inc_rev(kn->parent); return 0; } @@ -394,6 +395,7 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; + kernfs_inc_rev(kn->parent); rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index ccc3b44f6306..c2ae58f3b202 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -81,6 +81,25 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) return d_inode(dentry)->i_private; } +static inline void kernfs_set_rev(struct kernfs_node *parent, + struct dentry *dentry) +{ + dentry->d_time = parent->dir.rev; +} + +static inline void kernfs_inc_rev(struct kernfs_node *parent) +{ + parent->dir.rev++; +} + +static inline bool kernfs_dir_changed(struct kernfs_node *parent, + struct dentry *dentry) +{ + if (parent->dir.rev != dentry->d_time) + return true; + return false; +} + extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 9e8ca8743c26..d68b4ad09573 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -98,6 +98,11 @@ struct kernfs_elem_dir { * better directly in kernfs_node but is here to save space. */ struct kernfs_root *root; + /* + * Monotonic revision counter, used to identify if a directory + * node has changed during negative dentry revalidation. + */ + unsigned long rev; }; struct kernfs_elem_symlink { -- cgit v1.2.3 From c7e7c04274b13f98f758fb69b03f2ab61976ea80 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 16 Jul 2021 17:28:24 +0800 Subject: kernfs: use VFS negative dentry caching If there are many lookups for non-existent paths these negative lookups can lead to a lot of overhead during path walks. The VFS allows dentries to be created as negative and hashed, and caches them so they can be used to reduce the fairly high overhead alloc/free cycle that occurs during these lookups. Use the kernfs node parent revision to identify if a change has been made to the containing directory so that the negative dentry can be discarded and the lookup redone. Reviewed-by: Miklos Szeredi Signed-off-by: Ian Kent Link: https://lore.kernel.org/r/162642770420.63632.15791924970508867106.stgit@web.messagingengine.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 55 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index b3d1bc0f317d..0b21a8f961ac 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1039,9 +1039,31 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - /* Always perform fresh lookup for negatives */ - if (d_really_is_negative(dentry)) - goto out_bad_unlocked; + /* Negative hashed dentry? */ + if (d_really_is_negative(dentry)) { + struct kernfs_node *parent; + + /* If the kernfs parent node has changed discard and + * proceed to ->lookup. + */ + mutex_lock(&kernfs_mutex); + spin_lock(&dentry->d_lock); + parent = kernfs_dentry_node(dentry->d_parent); + if (parent) { + if (kernfs_dir_changed(parent, dentry)) { + spin_unlock(&dentry->d_lock); + mutex_unlock(&kernfs_mutex); + return 0; + } + } + spin_unlock(&dentry->d_lock); + mutex_unlock(&kernfs_mutex); + + /* The kernfs parent node hasn't changed, leave the + * dentry negative and return success. + */ + return 1; + } kn = kernfs_dentry_node(dentry); mutex_lock(&kernfs_mutex); @@ -1067,7 +1089,6 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) return 1; out_bad: mutex_unlock(&kernfs_mutex); -out_bad_unlocked: return 0; } @@ -1082,33 +1103,27 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *ret; struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; - struct inode *inode; + struct inode *inode = NULL; const void *ns = NULL; mutex_lock(&kernfs_mutex); - if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); - - /* no such entry */ - if (!kn || !kernfs_active(kn)) { - ret = NULL; - goto out_unlock; - } - /* attach dentry and inode */ - inode = kernfs_get_inode(dir->i_sb, kn); - if (!inode) { - ret = ERR_PTR(-ENOMEM); - goto out_unlock; + if (kn && kernfs_active(kn)) { + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) + inode = ERR_PTR(-ENOMEM); } - - /* instantiate and hash dentry */ + /* Needed only for negative dentry validation */ + if (!inode) + kernfs_set_rev(parent, dentry); + /* instantiate and hash (possibly negative) dentry */ ret = d_splice_alias(inode, dentry); - out_unlock: mutex_unlock(&kernfs_mutex); + return ret; } -- cgit v1.2.3 From 7ba0273b2f34a55efe967d3c7381fb1da2ca195f Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 16 Jul 2021 17:28:29 +0800 Subject: kernfs: switch kernfs to use an rwsem The kernfs global lock restricts the ability to perform kernfs node lookup operations in parallel during path walks. Change the kernfs mutex to an rwsem so that, when opportunity arises, node searches can be done in parallel with path walk lookups. Reviewed-by: Miklos Szeredi Signed-off-by: Ian Kent Link: https://lore.kernel.org/r/162642770946.63632.2218304587223241374.stgit@web.messagingengine.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 100 ++++++++++++++++++++++---------------------- fs/kernfs/file.c | 4 +- fs/kernfs/inode.c | 16 +++---- fs/kernfs/kernfs-internal.h | 5 ++- fs/kernfs/mount.c | 12 +++--- fs/kernfs/symlink.c | 4 +- include/linux/kernfs.h | 2 +- 7 files changed, 72 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 0b21a8f961ac..4994723d6cf7 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,7 @@ #include "kernfs-internal.h" -DEFINE_MUTEX(kernfs_mutex); +DECLARE_RWSEM(kernfs_rwsem); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ @@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ static bool kernfs_active(struct kernfs_node *kn) { - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held(&kernfs_rwsem); return atomic_read(&kn->active) >= 0; } @@ -340,7 +340,7 @@ static int kernfs_sd_compare(const struct kernfs_node *left, * @kn->parent->dir.children. * * Locking: - * mutex_lock(kernfs_mutex) + * kernfs_rwsem held exclusive * * RETURNS: * 0 on susccess -EEXIST on failure. @@ -386,7 +386,7 @@ static int kernfs_link_sibling(struct kernfs_node *kn) * removed, %false if @kn wasn't on the rbtree. * * Locking: - * mutex_lock(kernfs_mutex) + * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { @@ -457,14 +457,14 @@ void kernfs_put_active(struct kernfs_node *kn) * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) - __releases(&kernfs_mutex) __acquires(&kernfs_mutex) + __releases(&kernfs_rwsem) __acquires(&kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); @@ -483,7 +483,7 @@ static void kernfs_drain(struct kernfs_node *kn) kernfs_drain_open_files(kn); - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); } /** @@ -722,7 +722,7 @@ int kernfs_add_one(struct kernfs_node *kn) bool has_ns; int ret; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); @@ -753,7 +753,7 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr->ia_mtime = ps_iattr->ia_ctime; } - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. @@ -767,7 +767,7 @@ int kernfs_add_one(struct kernfs_node *kn) return 0; out_unlock: - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return ret; } @@ -788,7 +788,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held(&kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", @@ -820,7 +820,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, size_t len; char *p, *name; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_read(&kernfs_rwsem); /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ spin_lock_irq(&kernfs_rename_lock); @@ -860,10 +860,10 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, { struct kernfs_node *kn; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return kn; } @@ -884,10 +884,10 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, { struct kernfs_node *kn; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return kn; } @@ -1046,18 +1046,18 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) /* If the kernfs parent node has changed discard and * proceed to ->lookup. */ - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); spin_lock(&dentry->d_lock); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { if (kernfs_dir_changed(parent, dentry)) { spin_unlock(&dentry->d_lock); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return 0; } } spin_unlock(&dentry->d_lock); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. @@ -1066,7 +1066,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) } kn = kernfs_dentry_node(dentry); - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) @@ -1085,10 +1085,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return 1; out_bad: - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return 0; } @@ -1106,7 +1106,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, struct inode *inode = NULL; const void *ns = NULL; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; @@ -1122,7 +1122,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, kernfs_set_rev(parent, dentry); /* instantiate and hash (possibly negative) dentry */ ret = d_splice_alias(inode, dentry); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return ret; } @@ -1244,7 +1244,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, { struct rb_node *rbn; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) @@ -1280,7 +1280,7 @@ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { @@ -1294,14 +1294,14 @@ void kernfs_activate(struct kernfs_node *kn) pos->flags |= KERNFS_ACTIVATED; } - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_rwsem); /* * Short-circuit if non-root @kn has already finished removal. @@ -1324,7 +1324,7 @@ static void __kernfs_remove(struct kernfs_node *kn) pos = kernfs_leftmost_descendant(kn); /* - * kernfs_drain() drops kernfs_mutex temporarily and @pos's + * kernfs_drain() drops kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. @@ -1371,9 +1371,9 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); __kernfs_remove(kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); } /** @@ -1460,17 +1460,17 @@ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored - * while holding kernfs_mutex. The ones which lost arbitration - * waits for SUICDED && drained which can happen only after the - * enclosing kernfs operation which executed the winning instance - * of kernfs_remove_self() finished. + * while kernfs_rwsem for held exclusive. The ones which lost + * arbitration waits for SUICIDED && drained which can happen only + * after the enclosing kernfs operation which executed the winning + * instance of kernfs_remove_self() finished. */ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; @@ -1488,9 +1488,9 @@ bool kernfs_remove_self(struct kernfs_node *kn) atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); schedule(); - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); @@ -1498,12 +1498,12 @@ bool kernfs_remove_self(struct kernfs_node *kn) } /* - * This must be done while holding kernfs_mutex; otherwise, waiting - * for SUICIDED && deactivated could finish prematurely. + * This must be done while kernfs_rwsem held exclusive; otherwise, + * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return ret; } @@ -1527,13 +1527,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) __kernfs_remove(kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); if (kn) return 0; @@ -1559,7 +1559,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, if (!kn->parent) return -EINVAL; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || @@ -1613,7 +1613,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, error = 0; out: - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return error; } @@ -1688,7 +1688,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; @@ -1705,12 +1705,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) file->private_data = pos; kernfs_get(pos); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); } - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index c75719312147..60e2a86c535e 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -860,7 +860,7 @@ repeat: spin_unlock_irq(&kernfs_notify_lock); /* kick fsnotify */ - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; @@ -898,7 +898,7 @@ repeat: iput(inode); } - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); kernfs_put(kn); goto repeat; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 26f2aa3586f9..dad749f39518 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -100,9 +100,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); ret = __kernfs_setattr(kn, iattr); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return ret; } @@ -116,7 +116,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!kn) return -EINVAL; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) goto out; @@ -129,7 +129,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(&init_user_ns, inode, iattr); out: - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return error; } @@ -185,9 +185,9 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); kernfs_refresh_inode(kn, inode); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); generic_fillattr(&init_user_ns, inode, stat); return 0; @@ -278,9 +278,9 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, kn = inode->i_private; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); kernfs_refresh_inode(kn, inode); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return generic_permission(&init_user_ns, inode, mask); } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index c2ae58f3b202..f9cc912c31e1 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -69,7 +70,7 @@ struct kernfs_super_info { */ const void *ns; - /* anchored at kernfs_root->supers, protected by kernfs_mutex */ + /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) @@ -121,7 +122,7 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ -extern struct mutex kernfs_mutex; +extern struct rw_semaphore kernfs_rwsem; extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 9dc7e7a64e10..baa4155ba2ed 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -255,9 +255,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_shrink.seeks = 0; /* get root inode, initialize and unlock it */ - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; @@ -344,9 +344,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); list_add(&info->node, &info->root->supers); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); } fc->root = dget(sb->s_root); @@ -372,9 +372,9 @@ void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); list_del(&info->node); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); /* * Remove the superblock from fs_supers/s_instances diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 5432883d819f..c8f8e41b8411 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -116,9 +116,9 @@ static int kernfs_getlink(struct inode *inode, char *path) struct kernfs_node *target = kn->symlink.target_kn; int error; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); error = kernfs_get_target_path(parent, target, path); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return error; } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d68b4ad09573..1093abf7c28c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -193,7 +193,7 @@ struct kernfs_root { u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; - /* list of kernfs_super_info of this root, protected by kernfs_mutex */ + /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ struct list_head supers; wait_queue_head_t deactivate_waitq; -- cgit v1.2.3 From 47b5c64d0ab5e7136db2b78c6ec710e0d8a5a36b Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 16 Jul 2021 17:28:34 +0800 Subject: kernfs: use i_lock to protect concurrent inode updates The inode operations .permission() and .getattr() use the kernfs node write lock but all that's needed is the read lock to protect against partial updates of these kernfs node fields which are all done under the write lock. And .permission() is called frequently during path walks and can cause quite a bit of contention between kernfs node operations and path walks when the number of concurrent walks is high. To change kernfs_iop_getattr() and kernfs_iop_permission() to take the rw sem read lock instead of the write lock an additional lock is needed to protect against multiple processes concurrently updating the inode attributes and link count in kernfs_refresh_inode(). The inode i_lock seems like the sensible thing to use to protect these inode attribute updates so use it in kernfs_refresh_inode(). The last hunk in the patch, applied to kernfs_fill_super(), is possibly not needed but taking the lock was present originally. I prefer to continue to take it to protect against a partial update of the source kernfs fields during the call to kernfs_refresh_inode() made by kernfs_get_inode(). Reviewed-by: Miklos Szeredi Signed-off-by: Ian Kent Link: https://lore.kernel.org/r/162642771474.63632.16295959115893904470.stgit@web.messagingengine.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/inode.c | 18 ++++++++++++------ fs/kernfs/mount.c | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index dad749f39518..c0eae1725435 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -185,11 +185,13 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; - down_write(&kernfs_rwsem); + down_read(&kernfs_rwsem); + spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); - up_write(&kernfs_rwsem); - generic_fillattr(&init_user_ns, inode, stat); + spin_unlock(&inode->i_lock); + up_read(&kernfs_rwsem); + return 0; } @@ -272,17 +274,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct kernfs_node *kn; + int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; - down_write(&kernfs_rwsem); + down_read(&kernfs_rwsem); + spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); - up_write(&kernfs_rwsem); + ret = generic_permission(&init_user_ns, inode, mask); + spin_unlock(&inode->i_lock); + up_read(&kernfs_rwsem); - return generic_permission(&init_user_ns, inode, mask); + return ret; } int kernfs_xattr_get(struct kernfs_node *kn, const char *name, diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index baa4155ba2ed..f2f909d09f52 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -255,9 +255,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_shrink.seeks = 0; /* get root inode, initialize and unlock it */ - down_write(&kernfs_rwsem); + down_read(&kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); - up_write(&kernfs_rwsem); + up_read(&kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; -- cgit v1.2.3 From df6192f47d2311cf40cd4321cc59863a5853b665 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 16 Jul 2021 17:28:40 +0800 Subject: kernfs: dont call d_splice_alias() under kernfs node lock The call to d_splice_alias() in kernfs_iop_lookup() doesn't depend on any kernfs node so there's no reason to hold the kernfs node lock when calling it. Reviewed-by: Miklos Szeredi Signed-off-by: Ian Kent Link: https://lore.kernel.org/r/162642772000.63632.10672683419693513226.stgit@web.messagingengine.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 4994723d6cf7..ba581429bf7b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1100,7 +1100,6 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret; struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct inode *inode = NULL; @@ -1120,11 +1119,10 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, /* Needed only for negative dentry validation */ if (!inode) kernfs_set_rev(parent, dentry); - /* instantiate and hash (possibly negative) dentry */ - ret = d_splice_alias(inode, dentry); up_read(&kernfs_rwsem); - return ret; + /* instantiate and hash (possibly negative) dentry */ + return d_splice_alias(inode, dentry); } static int kernfs_iop_mkdir(struct user_namespace *mnt_userns, -- cgit v1.2.3 From 112cedc8e600b668688eb809bf11817adec58ddc Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 2 Aug 2021 18:24:44 +0200 Subject: debugfs: Return error during {full/open}_proxy_open() on rmmod MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a kernel module gets unloaded then it printed report about a leak before commit 275678e7a9be ("debugfs: Check module state before warning in {full/open}_proxy_open()"). An additional check was added in this commit to avoid this printing. But it was forgotten that the function must return an error in this case because it was not actually opened. As result, the systems started to crash or to hang when a module was unloaded while something was trying to open a file. Fixes: 275678e7a9be ("debugfs: Check module state before warning in {full/open}_proxy_open()") Cc: Taehee Yoo Reported-by: Mário Lopes Signed-off-by: Sven Eckelmann Link: https://lore.kernel.org/r/20210802162444.7848-1-sven@narfation.org Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index df00231d3ecc..7d162b0efbf0 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -179,8 +179,10 @@ static int open_proxy_open(struct inode *inode, struct file *filp) if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && - real_fops->owner->state == MODULE_STATE_GOING) + real_fops->owner->state == MODULE_STATE_GOING) { + r = -ENXIO; goto out; + } #endif /* Huh? Module did not clean up after itself at exit? */ @@ -314,8 +316,10 @@ static int full_proxy_open(struct inode *inode, struct file *filp) if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && - real_fops->owner->state == MODULE_STATE_GOING) + real_fops->owner->state == MODULE_STATE_GOING) { + r = -ENXIO; goto out; + } #endif /* Huh? Module did not cleanup after itself at exit? */ -- cgit v1.2.3 From 93bb8e352a9136a56dd26762bf54cf6554cfa96c Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczyński Date: Thu, 29 Jul 2021 23:32:34 +0000 Subject: sysfs: Invoke iomem_get_mapping() from the sysfs open callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defer invocation of the iomem_get_mapping() to the sysfs open callback so that it can be executed as needed when the binary sysfs object has been accessed. To do that, convert the "mapping" member of the struct bin_attribute from a pointer to the struct address_space into a function pointer with a signature that requires the same return type, and then updates the sysfs_kf_bin_open() to invoke provided function should the function pointer be valid. Also, convert every invocation of iomem_get_mapping() into a function pointer assignment, therefore allowing for the iomem_get_mapping() invocation to be deferred to when the sysfs open callback runs. Thus, this change removes the need for the fs_initcalls to complete before any other sub-system that uses the iomem_get_mapping() would be able to invoke it safely without leading to a failure and an Oops related to an invalid iomem_get_mapping() access. Suggested-by: Dan Williams Reviewed-by: Dan Williams Acked-by: Bjorn Helgaas Signed-off-by: Dan Williams Signed-off-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20210729233235.1508920-2-kw@linux.com Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-sysfs.c | 6 +++--- fs/sysfs/file.c | 2 +- include/linux/sysfs.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 5d63df7c1820..76e5545d0e73 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -965,7 +965,7 @@ void pci_create_legacy_files(struct pci_bus *b) b->legacy_io->read = pci_read_legacy_io; b->legacy_io->write = pci_write_legacy_io; b->legacy_io->mmap = pci_mmap_legacy_io; - b->legacy_io->mapping = iomem_get_mapping(); + b->legacy_io->mapping = iomem_get_mapping; pci_adjust_legacy_attr(b, pci_mmap_io); error = device_create_bin_file(&b->dev, b->legacy_io); if (error) @@ -978,7 +978,7 @@ void pci_create_legacy_files(struct pci_bus *b) b->legacy_mem->size = 1024*1024; b->legacy_mem->attr.mode = 0600; b->legacy_mem->mmap = pci_mmap_legacy_mem; - b->legacy_io->mapping = iomem_get_mapping(); + b->legacy_io->mapping = iomem_get_mapping; pci_adjust_legacy_attr(b, pci_mmap_mem); error = device_create_bin_file(&b->dev, b->legacy_mem); if (error) @@ -1195,7 +1195,7 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) } } if (res_attr->mmap) - res_attr->mapping = iomem_get_mapping(); + res_attr->mapping = iomem_get_mapping; res_attr->attr.name = res_attr_name; res_attr->attr.mode = 0600; res_attr->size = pci_resource_len(pdev, num); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 9aefa7779b29..a3ee4c32a264 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -175,7 +175,7 @@ static int sysfs_kf_bin_open(struct kernfs_open_file *of) struct bin_attribute *battr = of->kn->priv; if (battr->mapping) - of->file->f_mapping = battr->mapping; + of->file->f_mapping = battr->mapping(); return 0; } diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index a12556a4b93a..d5bcc897583c 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -176,7 +176,7 @@ struct bin_attribute { struct attribute attr; size_t size; void *private; - struct address_space *mapping; + struct address_space *(*mapping)(void); ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, -- cgit v1.2.3 From f06aff924f975881a6abf91d2af0078fc8cd37bf Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczyński Date: Thu, 29 Jul 2021 23:32:35 +0000 Subject: sysfs: Rename struct bin_attribute member to f_mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two users of iomem_get_mapping(), the struct file and struct bin_attribute. The former has a member called "f_mapping" and the latter has a member called "mapping", and both are poniters to struct address_space. Rename struct bin_attribute member to "f_mapping" to keep both meaning and the usage consistent with other users of iomem_get_mapping(). Acked-by: Bjorn Helgaas Signed-off-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20210729233235.1508920-3-kw@linux.com Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-sysfs.c | 6 +++--- fs/sysfs/file.c | 4 ++-- include/linux/sysfs.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 76e5545d0e73..f65382915f01 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -965,7 +965,7 @@ void pci_create_legacy_files(struct pci_bus *b) b->legacy_io->read = pci_read_legacy_io; b->legacy_io->write = pci_write_legacy_io; b->legacy_io->mmap = pci_mmap_legacy_io; - b->legacy_io->mapping = iomem_get_mapping; + b->legacy_io->f_mapping = iomem_get_mapping; pci_adjust_legacy_attr(b, pci_mmap_io); error = device_create_bin_file(&b->dev, b->legacy_io); if (error) @@ -978,7 +978,7 @@ void pci_create_legacy_files(struct pci_bus *b) b->legacy_mem->size = 1024*1024; b->legacy_mem->attr.mode = 0600; b->legacy_mem->mmap = pci_mmap_legacy_mem; - b->legacy_io->mapping = iomem_get_mapping; + b->legacy_io->f_mapping = iomem_get_mapping; pci_adjust_legacy_attr(b, pci_mmap_mem); error = device_create_bin_file(&b->dev, b->legacy_mem); if (error) @@ -1195,7 +1195,7 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) } } if (res_attr->mmap) - res_attr->mapping = iomem_get_mapping; + res_attr->f_mapping = iomem_get_mapping; res_attr->attr.name = res_attr_name; res_attr->attr.mode = 0600; res_attr->size = pci_resource_len(pdev, num); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index a3ee4c32a264..d019d6ac6ad0 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -174,8 +174,8 @@ static int sysfs_kf_bin_open(struct kernfs_open_file *of) { struct bin_attribute *battr = of->kn->priv; - if (battr->mapping) - of->file->f_mapping = battr->mapping(); + if (battr->f_mapping) + of->file->f_mapping = battr->f_mapping(); return 0; } diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index d5bcc897583c..e3f1e8ac1f85 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -176,7 +176,7 @@ struct bin_attribute { struct attribute attr; size_t size; void *private; - struct address_space *(*mapping)(void); + struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, -- cgit v1.2.3