From 6a07814ff643b5c8e1353d8c6229f52fde205cde Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 5 Jun 2026 15:53:16 +0200 Subject: kernfs: fix xattr race condition with multiple superblocks Multiple superblocks with different namespaces can share the same kernfs_node when kernfs_test_super() finds a matching root but different namespace. This means multiple inodes from different superblocks can reference the same kernfs_node->iattr->xattrs structure. The VFS layer only holds per-inode locks during xattr operations, which is insufficient to serialize concurrent xattr modifications on the shared kernfs_node. This can lead to race conditions in simple_xattr_set() where the lookup->replace/remove sequence is not atomic with respect to operations from other superblocks. Fix this by protecting xattr operations with the existing hashed kernfs_locks->open_file_mutex[] array, which is already used to protect per-node open file data. The hashed mutex array provides scalable per-node serialization (scaled by CPU count, up to 1024 locks on 32+ CPU systems) with zero memory overhead. 
Changes: - Rename open_file_mutex[] to node_mutex[] to reflect dual purpose - Add kernfs_node_lock_ptr() and kernfs_node_lock() helpers - Protect simple_xattr_set() calls in kernfs_xattr_set() and kernfs_vfs_user_xattr_set() with the hashed mutex - Update file.c to use new helpers via compatibility wrappers - Update documentation to explain the extended lock usage Fixes: b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") Reported-by: Sashiko Closes: https://sashiko.dev/#/patchset/20260601162454.2116375-1-mszeredi%40redhat.com Assisted-by: Claude:claude-sonnet-4-5 Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260605135322.2632068-2-mszeredi@redhat.com Signed-off-by: Christian Brauner (Amutable) --- include/linux/kernfs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index e21b2f7f4159..351a5101c862 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -76,20 +76,25 @@ struct kernfs_iattrs; * kernfs_open_file. * * kernfs_open_files are chained at kernfs_open_node->files, which is - * protected by kernfs_global_locks.open_file_mutex[i]. + * protected by kernfs_global_locks.node_mutex[i]. * * To reduce possible contention in sysfs access, arising due to single - * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node + * locks, use an array of locks (e.g. node_mutex) and use kernfs_node * object address as hash keys to get the index of these locks. * * Hashed mutexes are safe to use here because operations using these don't * rely on global exclusion. * + * The hashed mutex array protects per-node data: the kernfs_open_node for + * open file management, and kernfs_node xattr operations (necessary because + * multiple superblocks with different namespaces can share the same + * kernfs_node, making per-inode locking insufficient). 
+ * In future we intend to replace other global locks with hashed ones as well. * kernfs_global_locks acts as a holder for all such hash tables. */ struct kernfs_global_locks { - struct mutex open_file_mutex[NR_KERNFS_LOCKS]; + struct mutex node_mutex[NR_KERNFS_LOCKS]; }; enum kernfs_node_type { -- cgit v1.2.3 From 076e5cef28e27febfc09b5f72544d2b857c75201 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 5 Jun 2026 15:53:18 +0200 Subject: simple_xattr: change interface to pass struct simple_xattrs ** Change the simple_xattr API to accept pointer-to-pointer (struct simple_xattrs **) instead of pointer. This allows the functions to handle lazy allocation internally without requiring callers to use simple_xattrs_lazy_alloc(). The simple_xattr_set(), simple_xattr_set_limited() and simple_xattr_add() functions now handle allocation when xattrs is NULL. simple_xattrs_free() now also frees the xattrs structure itself and sets the pointer to NULL. This simplifies callers and removes the need for most callers to explicitly manage xattrs allocation and lifetime. In shmem_initxattrs(), the total required space for all initial xattrs (ispace) is pre-calculated and deducted from sbinfo->free_ispace. Since this patch modifies the function to add new xattrs directly to the inode's &info->xattrs list rather than using a local temporary variable, a failure means that the partially populated info->xattrs list remains attached to the inode. When the VFS caller handles the -ENOMEM error, it drops the newly created inode via iput(), shmem_free_inode() adds freed to sbinfo->free_ispace a second time, permanently inflating the tmpfs free space quota. Fix by subtracting already added xattrs from ispace. 
Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260605135322.2632068-4-mszeredi@redhat.com Signed-off-by: Christian Brauner (Amutable) --- fs/kernfs/dir.c | 12 +++--------- fs/kernfs/inode.c | 24 ++++-------------------- fs/pidfs.c | 37 ++++++------------------------------- fs/xattr.c | 46 +++++++++++++++++++++++++++++++++++++--------- include/linux/xattr.h | 21 ++++++--------------- mm/shmem.c | 34 ++++++++++------------------------ net/socket.c | 24 +++++------------------- 7 files changed, 71 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 4f9ade82b08a..368dc4a217d9 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -605,11 +605,8 @@ void kernfs_put(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - if (kn->iattr && kn->iattr->xattrs) { - simple_xattrs_free(kn->iattr->xattrs, NULL); - kfree(kn->iattr->xattrs); - kn->iattr->xattrs = NULL; - } + if (kn->iattr) + simple_xattrs_free(&kn->iattr->xattrs, NULL); spin_lock(&root->kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); @@ -709,10 +706,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, err_out4: if (kn->iattr) { - if (kn->iattr->xattrs) { - simple_xattrs_free(kn->iattr->xattrs, NULL); - kfree(kn->iattr->xattrs); - } + simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } err_out3: diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index e676737d9531..f2298de6bc6f 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -144,8 +144,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) if (!attrs) return -ENOMEM; - return simple_xattr_list(d_inode(dentry), READ_ONCE(attrs->xattrs), - buf, size); + return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) @@ -297,23 +296,17 @@ int 
kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn); - struct simple_xattrs *xattrs; if (!attrs) return -ENODATA; - xattrs = READ_ONCE(attrs->xattrs); - if (!xattrs) - return -ENODATA; - - return simple_xattr_get(xattrs, name, value, size); + return simple_xattr_get(&attrs->xattrs, name, value, size); } int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr; - struct simple_xattrs *xattrs; struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); @@ -329,11 +322,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name, */ CLASS(kernfs_node_lock, lock)(kn); - xattrs = simple_xattrs_lazy_alloc(&attrs->xattrs, value, flags); - if (IS_ERR_OR_NULL(xattrs)) - return PTR_ERR(xattrs); - - old_xattr = simple_xattr_set(xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); @@ -371,7 +360,6 @@ static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, { const char *full_name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; - struct simple_xattrs *xattrs; struct kernfs_iattrs *attrs; if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) @@ -384,11 +372,7 @@ static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, /* See comment in kernfs_xattr_set() about locking. 
*/ CLASS(kernfs_node_lock, lock)(kn); - xattrs = simple_xattrs_lazy_alloc(&attrs->xattrs, value, flags); - if (IS_ERR_OR_NULL(xattrs)) - return PTR_ERR(xattrs); - - return simple_xattr_set_limited(xattrs, &attrs->xattr_limits, + return simple_xattr_set_limited(&attrs->xattrs, &attrs->xattr_limits, full_name, value, size, flags); } diff --git a/fs/pidfs.c b/fs/pidfs.c index 1cce4f34a051..eb5105bddeca 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -196,12 +196,7 @@ static void pidfs_free_attr_work(struct work_struct *work) head = llist_del_all(&pidfs_free_list); llist_for_each_entry_safe(attr, next, head, pidfs_llist) { - struct simple_xattrs *xattrs = attr->xattrs; - - if (xattrs) { - simple_xattrs_free(xattrs, NULL); - kfree(xattrs); - } + simple_xattrs_free(&attr->xattrs, NULL); kfree(attr); } } @@ -815,14 +810,8 @@ static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size) { struct inode *inode = d_inode(dentry); struct pid *pid = inode->i_private; - struct pidfs_attr *attr = pid->attr; - struct simple_xattrs *xattrs; - - xattrs = READ_ONCE(attr->xattrs); - if (!xattrs) - return 0; - return simple_xattr_list(inode, xattrs, buf, size); + return simple_xattr_list(inode, &pid->attr->xattrs, buf, size); } static const struct inode_operations pidfs_inode_operations = { @@ -1057,16 +1046,9 @@ static int pidfs_xattr_get(const struct xattr_handler *handler, const char *suffix, void *value, size_t size) { struct pid *pid = inode->i_private; - struct pidfs_attr *attr = pid->attr; - const char *name; - struct simple_xattrs *xattrs; - - xattrs = READ_ONCE(attr->xattrs); - if (!xattrs) - return -ENODATA; + const char *name = xattr_full_name(handler, suffix); - name = xattr_full_name(handler, suffix); - return simple_xattr_get(xattrs, name, value, size); + return simple_xattr_get(&pid->attr->xattrs, name, value, size); } static int pidfs_xattr_set(const struct xattr_handler *handler, @@ -1075,20 +1057,13 @@ static int pidfs_xattr_set(const struct xattr_handler 
*handler, const void *value, size_t size, int flags) { struct pid *pid = inode->i_private; - struct pidfs_attr *attr = pid->attr; - const char *name; - struct simple_xattrs *xattrs; + const char *name = xattr_full_name(handler, suffix); struct simple_xattr *old_xattr; /* Ensure we're the only one to set @attr->xattrs. */ WARN_ON_ONCE(!inode_is_locked(inode)); - xattrs = simple_xattrs_lazy_alloc(&attr->xattrs, value, flags); - if (IS_ERR_OR_NULL(xattrs)) - return PTR_ERR(xattrs); - - name = xattr_full_name(handler, suffix); - old_xattr = simple_xattr_set(xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(&pid->attr->xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); diff --git a/fs/xattr.c b/fs/xattr.c index 09ecbaaa1660..9ef7ad8a8f32 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1311,12 +1311,17 @@ static const struct rhashtable_params simple_xattr_params = { * Return: On success the length of the xattr value is returned. On error a * negative error code is returned. */ -int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, +int simple_xattr_get(struct simple_xattrs **xattrsp, const char *name, void *buffer, size_t size) { + struct simple_xattrs *xattrs; struct simple_xattr *xattr; int ret = -ENODATA; + xattrs = READ_ONCE(*xattrsp); + if (!xattrs) + return -ENODATA; + guard(rcu)(); xattr = rhashtable_lookup(&xattrs->ht, name, simple_xattr_params); if (xattr) { @@ -1331,6 +1336,9 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, return ret; } +static struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, + const void *value, int flags); + /** * simple_xattr_set - set an xattr object * @xattrs: the header of the xattr object @@ -1362,13 +1370,18 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. 
On failure a negative error code is returned. */ -struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, +struct simple_xattr *simple_xattr_set(struct simple_xattrs **xattrsp, const char *name, const void *value, size_t size, int flags) { + struct simple_xattrs *xattrs; struct simple_xattr *old_xattr = NULL; int err; + xattrs = simple_xattrs_lazy_alloc(xattrsp, value, flags); + if (IS_ERR_OR_NULL(xattrs)) + return ERR_CAST(xattrs); + CLASS(simple_xattr, new_xattr)(value, size); if (IS_ERR(new_xattr)) return new_xattr; @@ -1467,7 +1480,7 @@ static inline int simple_xattr_limits_inc(struct simple_xattr_limits *limits, * Return: On success zero is returned. On failure a negative error code is * returned. */ -int simple_xattr_set_limited(struct simple_xattrs *xattrs, +int simple_xattr_set_limited(struct simple_xattrs **xattrs, struct simple_xattr_limits *limits, const char *name, const void *value, size_t size, int flags) @@ -1527,10 +1540,11 @@ static bool xattr_is_maclabel(const char *name) * Return: On success the required size or the size of the copied xattrs is * returned. On error a negative error code is returned. */ -ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, +ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs **xattrsp, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); + struct simple_xattrs *xattrs; struct rhashtable_iter iter; struct simple_xattr *xattr; ssize_t remaining_size = size; @@ -1552,6 +1566,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, remaining_size -= err; err = 0; + xattrs = READ_ONCE(*xattrsp); if (!xattrs) return size - remaining_size; @@ -1597,9 +1612,15 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, * Return: On success zero is returned. On failure a negative error code is * returned. 
*/ -int simple_xattr_add(struct simple_xattrs *xattrs, +int simple_xattr_add(struct simple_xattrs **xattrsp, struct simple_xattr *new_xattr) { + struct simple_xattrs *xattrs; + + xattrs = simple_xattrs_lazy_alloc(xattrsp, new_xattr->value, 0); + if (IS_ERR(xattrs)) + return PTR_ERR(xattrs); + return rhashtable_insert_fast(&xattrs->ht, &new_xattr->hash_node, simple_xattr_params); } @@ -1629,7 +1650,7 @@ int simple_xattrs_init(struct simple_xattrs *xattrs) * Return: On success a new simple_xattrs is returned. On failure an * ERR_PTR is returned. */ -struct simple_xattrs *simple_xattrs_alloc(void) +static struct simple_xattrs *simple_xattrs_alloc(void) { struct simple_xattrs *xattrs __free(kfree) = NULL; int ret; @@ -1661,8 +1682,8 @@ struct simple_xattrs *simple_xattrs_alloc(void) * check with IS_ERR_OR_NULL() and propagate with PTR_ERR() which * correctly returns 0 for the NULL no-op case. */ -struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, - const void *value, int flags) +static struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, + const void *value, int flags) { struct simple_xattrs *xattrs; @@ -1697,12 +1718,19 @@ static void simple_xattr_ht_free(void *ptr, void *arg) * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. 
*/ -void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) +void simple_xattrs_free(struct simple_xattrs **xattrsp, size_t *freed_space) { + struct simple_xattrs *xattrs = *xattrsp; + might_sleep(); + if (!xattrs) + return; + if (freed_space) *freed_space = 0; rhashtable_free_and_destroy(&xattrs->ht, simple_xattr_ht_free, freed_space); + kfree(xattrs); + *xattrsp = NULL; } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 8b6601367eae..ded446c1ef81 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -133,26 +133,23 @@ static inline void simple_xattr_limits_init(struct simple_xattr_limits *limits) } int simple_xattrs_init(struct simple_xattrs *xattrs); -struct simple_xattrs *simple_xattrs_alloc(void); -struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, - const void *value, int flags); -void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); +void simple_xattrs_free(struct simple_xattrs **xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); void simple_xattr_free_rcu(struct simple_xattr *xattr); -int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, +int simple_xattr_get(struct simple_xattrs **xattrs, const char *name, void *buffer, size_t size); -struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, +struct simple_xattr *simple_xattr_set(struct simple_xattrs **xattrs, const char *name, const void *value, size_t size, int flags); -int simple_xattr_set_limited(struct simple_xattrs *xattrs, +int simple_xattr_set_limited(struct simple_xattrs **xattrs, struct simple_xattr_limits *limits, const char *name, const void *value, size_t size, int flags); -ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, +ssize_t simple_xattr_list(struct inode *inode, struct 
simple_xattrs **xattrs, char *buffer, size_t size); -int simple_xattr_add(struct simple_xattrs *xattrs, +int simple_xattr_add(struct simple_xattrs **xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); @@ -162,10 +159,4 @@ DEFINE_CLASS(simple_xattr, simple_xattr_alloc(value, size), const void *value, size_t size) -DEFINE_CLASS(simple_xattrs, - struct simple_xattrs *, - if (!IS_ERR_OR_NULL(_T)) { simple_xattrs_free(_T, NULL); kfree(_T); }, - simple_xattrs_alloc(), - void) - #endif /* _LINUX_XATTR_H */ diff --git a/mm/shmem.c b/mm/shmem.c index c7897570fc9f..cf4ee9f41191 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1425,10 +1425,8 @@ static void shmem_evict_inode(struct inode *inode) } } - if (info->xattrs) { - simple_xattrs_free(info->xattrs, sbinfo->max_inodes ? &freed : NULL); - kfree(info->xattrs); - } + simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); + shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); @@ -4233,10 +4231,6 @@ static int shmem_initxattrs(struct inode *inode, const struct xattr *xattr; size_t ispace = 0; - CLASS(simple_xattrs, xattrs)(); - if (IS_ERR(xattrs)) - return PTR_ERR(xattrs); - if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, @@ -4264,8 +4258,11 @@ static int shmem_initxattrs(struct inode *inode, if (!new_xattr->name) break; - if (simple_xattr_add(xattrs, new_xattr)) + if (simple_xattr_add(&info->xattrs, new_xattr)) break; + + if (sbinfo->max_inodes) + ispace -= simple_xattr_space(new_xattr->name, new_xattr->size); retain_and_null_ptr(new_xattr); } @@ -4277,8 +4274,8 @@ static int shmem_initxattrs(struct inode *inode, } return -ENOMEM; } + WARN_ON(ispace); - smp_store_release(&info->xattrs, no_free_ptr(xattrs)); return 0; } @@ -4287,14 +4284,9 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, const char *name, void 
*buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); - struct simple_xattrs *xattrs; - - xattrs = READ_ONCE(info->xattrs); - if (!xattrs) - return -ENODATA; name = xattr_full_name(handler, name); - return simple_xattr_get(xattrs, name, buffer, size); + return simple_xattr_get(&info->xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, @@ -4305,16 +4297,11 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - struct simple_xattrs *xattrs; struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); - xattrs = simple_xattrs_lazy_alloc(&info->xattrs, value, flags); - if (IS_ERR_OR_NULL(xattrs)) - return PTR_ERR(xattrs); - if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); @@ -4327,7 +4314,7 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, return -ENOSPC; } - old_xattr = simple_xattr_set(xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) @@ -4375,8 +4362,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - return simple_xattr_list(d_inode(dentry), READ_ONCE(info->xattrs), - buffer, size); + return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ diff --git a/net/socket.c b/net/socket.c index 22a412fdec07..d3597c858345 100644 --- a/net/socket.c +++ b/net/socket.c @@ -347,12 +347,8 @@ static struct inode *sock_alloc_inode(struct super_block *sb) static void sock_evict_inode(struct inode *inode) { struct sockfs_inode *si = SOCKFS_I(inode); - struct simple_xattrs *xattrs = si->xattrs; - if 
(xattrs) { - simple_xattrs_free(xattrs, NULL); - kfree(xattrs); - } + simple_xattrs_free(&si->xattrs, NULL); clear_inode(inode); } @@ -443,13 +439,9 @@ static int sockfs_user_xattr_get(const struct xattr_handler *handler, const char *suffix, void *value, size_t size) { const char *name = xattr_full_name(handler, suffix); - struct simple_xattrs *xattrs; - - xattrs = READ_ONCE(SOCKFS_I(inode)->xattrs); - if (!xattrs) - return -ENODATA; + struct sockfs_inode *si = SOCKFS_I(inode); - return simple_xattr_get(xattrs, name, value, size); + return simple_xattr_get(&si->xattrs, name, value, size); } static int sockfs_user_xattr_set(const struct xattr_handler *handler, @@ -460,13 +452,8 @@ static int sockfs_user_xattr_set(const struct xattr_handler *handler, { const char *name = xattr_full_name(handler, suffix); struct sockfs_inode *si = SOCKFS_I(inode); - struct simple_xattrs *xattrs; - - xattrs = simple_xattrs_lazy_alloc(&si->xattrs, value, flags); - if (IS_ERR_OR_NULL(xattrs)) - return PTR_ERR(xattrs); - return simple_xattr_set_limited(xattrs, &si->xattr_limits, + return simple_xattr_set_limited(&si->xattrs, &si->xattr_limits, name, value, size, flags); } @@ -635,8 +622,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, struct sockfs_inode *si = SOCKFS_I(d_inode(dentry)); ssize_t len, used; - len = simple_xattr_list(d_inode(dentry), READ_ONCE(si->xattrs), - buffer, size); + len = simple_xattr_list(d_inode(dentry), &si->xattrs, buffer, size); if (len < 0) return len; -- cgit v1.2.3 From 1e7cd8a53b72a58a44c4d282aed95f6ce0e76db0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 5 Jun 2026 15:53:19 +0200 Subject: simpe_xattr: use per-sb cache Move the hash table to the super block to remove excessive overhead in case of small number of xattrs per inode. Add linked list to the inode, used for listxattr and eviction. Listxattr uses rcu protection to iterate the list of xattrs. Before being made per-sb, lazy allocation was protected by inode lock. 
Now inode lock no longer provides sufficient exclusion, so use cmpxchg() to ensure atomicity. Though I haven't found a description of this pattern, after some research it seems that cmpxchg_release() and READ_ONCE() should provide the necessary memory barriers. Use simple_xattr_free_rcu() in simple_xattrs_free(). This is needed because the hash table is now shared between inodes and lookup on a different inode might be running the compare function on the just freed element within the RCU grace period. Following stats are based on slabinfo diff, after creating 100k empty files, then adding a "user.test=foo" xattr to each: v7.0 (no rhashtable): File creation: 993.40 bytes/file Xattr addition: 79.99 bytes/file v7.1-rc2 (per-inode rhashtable): File creation: 939.73 bytes/file Xattr addition: 1296.08 bytes/file v7.1-rc2 + this patch (per-sb rhashtable) File creation: 946.84 bytes/file Xattr addition: 111.86 bytes/file The overhead of a single xattr is reduced to nearly v7.0 levels. The per xattr overhead is slightly larger due to the addition of three pointers to struct simple_xattr. 
Fixes: b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260605135322.2632068-5-mszeredi@redhat.com Signed-off-by: Christian Brauner (Amutable) --- fs/kernfs/dir.c | 5 +- fs/kernfs/inode.c | 10 +- fs/kernfs/kernfs-internal.h | 4 +- fs/pidfs.c | 14 ++- fs/xattr.c | 283 +++++++++++++++++++++----------------------- include/linux/shmem_fs.h | 3 +- include/linux/xattr.h | 26 ++-- mm/shmem.c | 13 +- net/socket.c | 12 +- 9 files changed, 192 insertions(+), 178 deletions(-) (limited to 'include/linux') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 368dc4a217d9..8ba2f2f3da9e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -606,7 +606,7 @@ void kernfs_put(struct kernfs_node *kn) kernfs_put(kn->symlink.target_kn); if (kn->iattr) - simple_xattrs_free(&kn->iattr->xattrs, NULL); + simple_xattrs_free(&root->xa_cache, &kn->iattr->xattrs, NULL); spin_lock(&root->kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); @@ -621,6 +621,7 @@ void kernfs_put(struct kernfs_node *kn) } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); + simple_xattr_cache_cleanup(&root->xa_cache); kfree_rcu(root, rcu); } } @@ -706,7 +707,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, err_out4: if (kn->iattr) { - simple_xattrs_free(&kn->iattr->xattrs, NULL); + simple_xattrs_free(&root->xa_cache, &kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } err_out3: diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index f2298de6bc6f..2cb20294aaf5 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -37,6 +37,7 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc) if (!ret) return NULL; + INIT_LIST_HEAD_RCU(&ret->xattrs); /* assign default attributes */ ret->ia_uid = GLOBAL_ROOT_UID; ret->ia_gid = GLOBAL_ROOT_GID; @@ -296,11 +297,12 @@ int kernfs_xattr_get(struct kernfs_node *kn, const 
char *name, void *value, size_t size) { struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn); + struct simple_xattr_cache *cache = &kernfs_root(kn)->xa_cache; if (!attrs) return -ENODATA; - return simple_xattr_get(&attrs->xattrs, name, value, size); + return simple_xattr_get(cache, &attrs->xattrs, name, value, size); } int kernfs_xattr_set(struct kernfs_node *kn, const char *name, @@ -308,6 +310,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name, { struct simple_xattr *old_xattr; struct kernfs_iattrs *attrs; + struct simple_xattr_cache *cache = &kernfs_root(kn)->xa_cache; attrs = kernfs_iattrs(kn); if (!attrs) @@ -322,7 +325,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name, */ CLASS(kernfs_node_lock, lock)(kn); - old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(cache, &attrs->xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); @@ -372,7 +375,8 @@ static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, /* See comment in kernfs_xattr_set() about locking. 
*/ CLASS(kernfs_node_lock, lock)(kn); - return simple_xattr_set_limited(&attrs->xattrs, &attrs->xattr_limits, + return simple_xattr_set_limited(&kernfs_root(kn)->xa_cache, + &attrs->xattrs, &attrs->xattr_limits, full_name, value, size, flags); } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 1dc6663553d1..aa784b540b36 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -26,7 +26,7 @@ struct kernfs_iattrs { struct timespec64 ia_mtime; struct timespec64 ia_ctime; - struct simple_xattrs *xattrs; + struct list_head xattrs; struct simple_xattr_limits xattr_limits; }; @@ -54,6 +54,8 @@ struct kernfs_root { rwlock_t kernfs_rename_lock; struct rcu_head rcu; + + struct simple_xattr_cache xa_cache; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/fs/pidfs.c b/fs/pidfs.c index eb5105bddeca..143d0aec16af 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -37,6 +37,8 @@ static struct kmem_cache *pidfs_attr_cachep __ro_after_init; static struct path pidfs_root_path = {}; +static struct simple_xattr_cache pidfs_xa_cache; + void pidfs_get_root(struct path *path) { *path = pidfs_root_path; @@ -96,7 +98,7 @@ static const struct rhashtable_params pidfs_ino_ht_params = { * use file handles. 
*/ struct pidfs_attr { - struct simple_xattrs *xattrs; + struct list_head xattrs; union { struct pidfs_anon_attr; struct llist_node pidfs_llist; @@ -196,7 +198,7 @@ static void pidfs_free_attr_work(struct work_struct *work) head = llist_del_all(&pidfs_free_list); llist_for_each_entry_safe(attr, next, head, pidfs_llist) { - simple_xattrs_free(&attr->xattrs, NULL); + simple_xattrs_free(&pidfs_xa_cache, &attr->xattrs, NULL); kfree(attr); } } @@ -224,7 +226,7 @@ void pidfs_free_pid(struct pid *pid) if (IS_ERR(attr)) return; - if (likely(!attr->xattrs)) + if (likely(list_empty(&attr->xattrs))) kfree(attr); else if (llist_add(&attr->pidfs_llist, &pidfs_free_list)) schedule_work(&pidfs_free_work); @@ -1007,6 +1009,8 @@ int pidfs_register_pid(struct pid *pid) if (!new_attr) return -ENOMEM; + INIT_LIST_HEAD_RCU(&new_attr->xattrs); + /* Synchronize with pidfs_exit(). */ guard(spinlock_irq)(&pid->wait_pidfd.lock); @@ -1048,7 +1052,7 @@ static int pidfs_xattr_get(const struct xattr_handler *handler, struct pid *pid = inode->i_private; const char *name = xattr_full_name(handler, suffix); - return simple_xattr_get(&pid->attr->xattrs, name, value, size); + return simple_xattr_get(&pidfs_xa_cache, &pid->attr->xattrs, name, value, size); } static int pidfs_xattr_set(const struct xattr_handler *handler, @@ -1063,7 +1067,7 @@ static int pidfs_xattr_set(const struct xattr_handler *handler, /* Ensure we're the only one to set @attr->xattrs. 
*/ WARN_ON_ONCE(!inode_is_locked(inode)); - old_xattr = simple_xattr_set(&pid->attr->xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(&pidfs_xa_cache, &pid->attr->xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); diff --git a/fs/xattr.c b/fs/xattr.c index 9ef7ad8a8f32..89374cd9029a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -28,6 +28,11 @@ #include "internal.h" +struct sx_key { + const struct list_head *parent; + const char *name; +}; + static const char * strcmp_prefix(const char *a, const char *a_prefix) { @@ -1269,23 +1274,32 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) return new_xattr; } +static u32 sx_hashfn(const char *name, const struct list_head *parent, u32 seed) +{ + return jhash(name, strlen(name), jhash(&parent, sizeof(parent), seed)); +} + static u32 simple_xattr_hashfn(const void *data, u32 len, u32 seed) { - const char *name = data; - return jhash(name, strlen(name), seed); + const struct sx_key *key = data; + + return sx_hashfn(key->name, key->parent, seed); } static u32 simple_xattr_obj_hashfn(const void *obj, u32 len, u32 seed) { const struct simple_xattr *xattr = obj; - return jhash(xattr->name, strlen(xattr->name), seed); + + return sx_hashfn(xattr->name, xattr->parent, seed); } static int simple_xattr_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct simple_xattr *xattr = obj; - return strcmp(xattr->name, arg->key); + const struct sx_key *key = arg->key; + + return xattr->parent != key->parent || strcmp(xattr->name, key->name); } static const struct rhashtable_params simple_xattr_params = { @@ -1298,6 +1312,7 @@ static const struct rhashtable_params simple_xattr_params = { /** * simple_xattr_get - get an xattr object + * @cache: anchor for the hash table * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @buffer: the buffer to store the value into @@ -1311,19 +1326,19 @@ static const struct 
rhashtable_params simple_xattr_params = { * Return: On success the length of the xattr value is returned. On error a * negative error code is returned. */ -int simple_xattr_get(struct simple_xattrs **xattrsp, const char *name, - void *buffer, size_t size) +int simple_xattr_get(struct simple_xattr_cache *cache, struct list_head *xattrs, + const char *name, void *buffer, size_t size) { - struct simple_xattrs *xattrs; struct simple_xattr *xattr; + struct sx_key key = { .parent = xattrs, .name = name }; + struct rhashtable *ht = READ_ONCE(cache->ht); int ret = -ENODATA; - xattrs = READ_ONCE(*xattrsp); - if (!xattrs) - return -ENODATA; + if (!ht) + return ret; guard(rcu)(); - xattr = rhashtable_lookup(&xattrs->ht, name, simple_xattr_params); + xattr = rhashtable_lookup(ht, &key, simple_xattr_params); if (xattr) { ret = xattr->size; if (buffer) { @@ -1336,11 +1351,45 @@ int simple_xattr_get(struct simple_xattrs **xattrsp, const char *name, return ret; } -static struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, - const void *value, int flags); +static struct rhashtable *simple_xattrs_lazy_alloc(struct simple_xattr_cache *cache, + const void *value, int flags) +{ + struct rhashtable *oldht, *ht = READ_ONCE(cache->ht); + int err; + + if (unlikely(!ht)) { + if (!value) + return (flags & XATTR_REPLACE) ? ERR_PTR(-ENODATA) : NULL; + + ht = kzalloc_obj(*ht); + if (!ht) + return ERR_PTR(-ENOMEM); + + err = rhashtable_init(ht, &simple_xattr_params); + if (err) { + kfree(ht); + return ERR_PTR(err); + } + + /* + * Provides release semantics on success, so that use of a + * non-NULL READ_ONCE(cache->ht) will be ordered relative to the + * above initialization, due to implicit address dependency. 
+ */ + oldht = cmpxchg_release(&cache->ht, NULL, ht); + if (oldht) { + /* Race lost */ + rhashtable_destroy(ht); + kfree(ht); + ht = oldht; + } + } + return ht; +} /** * simple_xattr_set - set an xattr object + * @cache: anchor for the hash table * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @value: the value to store along the xattr @@ -1370,50 +1419,58 @@ static struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xat * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. On failure a negative error code is returned. */ -struct simple_xattr *simple_xattr_set(struct simple_xattrs **xattrsp, +struct simple_xattr *simple_xattr_set(struct simple_xattr_cache *cache, struct list_head *xattrs, const char *name, const void *value, size_t size, int flags) { - struct simple_xattrs *xattrs; + struct sx_key key = { .parent = xattrs, .name = name }; struct simple_xattr *old_xattr = NULL; + struct rhashtable *ht; int err; - xattrs = simple_xattrs_lazy_alloc(xattrsp, value, flags); - if (IS_ERR_OR_NULL(xattrs)) - return ERR_CAST(xattrs); + ht = simple_xattrs_lazy_alloc(cache, value, flags); + if (IS_ERR_OR_NULL(ht)) + return ERR_CAST(ht); CLASS(simple_xattr, new_xattr)(value, size); if (IS_ERR(new_xattr)) return new_xattr; if (new_xattr) { + new_xattr->parent = xattrs; new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) return ERR_PTR(-ENOMEM); } - /* Lookup is safe without RCU here since writes are serialized. */ - old_xattr = rhashtable_lookup_fast(&xattrs->ht, name, - simple_xattr_params); - + /* + * Hash table lookup/replace/remove will grab RCU read lock themselves. + * This makes sure that hash table lookup is safe against concurrent + * modification on another inode. + */ + old_xattr = rhashtable_lookup_fast(ht, &key, simple_xattr_params); if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. 
*/ if (flags & XATTR_CREATE) return ERR_PTR(-EEXIST); if (new_xattr) { - err = rhashtable_replace_fast(&xattrs->ht, + err = rhashtable_replace_fast(ht, &old_xattr->hash_node, &new_xattr->hash_node, simple_xattr_params); if (err) return ERR_PTR(err); + + list_replace_rcu(&old_xattr->node, &new_xattr->node); } else { - err = rhashtable_remove_fast(&xattrs->ht, + err = rhashtable_remove_fast(ht, &old_xattr->hash_node, simple_xattr_params); if (err) return ERR_PTR(err); + + list_del_rcu(&old_xattr->node); } } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ @@ -1425,11 +1482,13 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs **xattrsp, * new value simply insert it. */ if (new_xattr) { - err = rhashtable_insert_fast(&xattrs->ht, + err = rhashtable_insert_fast(ht, &new_xattr->hash_node, simple_xattr_params); if (err) return ERR_PTR(err); + + list_add_tail_rcu(&new_xattr->node, xattrs); } /* @@ -1466,6 +1525,7 @@ static inline int simple_xattr_limits_inc(struct simple_xattr_limits *limits, /** * simple_xattr_set_limited - set an xattr with per-inode user.* limits + * @cache: anchor for the hash table * @xattrs: the header of the xattr object * @limits: per-inode limit counters for user.* xattrs * @name: the name of the xattr to set or remove @@ -1480,7 +1540,7 @@ static inline int simple_xattr_limits_inc(struct simple_xattr_limits *limits, * Return: On success zero is returned. On failure a negative error code is * returned. 
*/ -int simple_xattr_set_limited(struct simple_xattrs **xattrs, +int simple_xattr_set_limited(struct simple_xattr_cache *cache, struct list_head *xattrs, struct simple_xattr_limits *limits, const char *name, const void *value, size_t size, int flags) @@ -1494,7 +1554,7 @@ int simple_xattr_set_limited(struct simple_xattrs **xattrs, return ret; } - old_xattr = simple_xattr_set(xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(cache, xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) { if (value) simple_xattr_limits_dec(limits, size); @@ -1540,12 +1600,10 @@ static bool xattr_is_maclabel(const char *name) * Return: On success the required size or the size of the copied xattrs is * returned. On error a negative error code is returned. */ -ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs **xattrsp, +ssize_t simple_xattr_list(struct inode *inode, struct list_head *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); - struct simple_xattrs *xattrs; - struct rhashtable_iter iter; struct simple_xattr *xattr; ssize_t remaining_size = size; int err = 0; @@ -1566,21 +1624,11 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs **xattrsp, remaining_size -= err; err = 0; - xattrs = READ_ONCE(*xattrsp); if (!xattrs) return size - remaining_size; - rhashtable_walk_enter(&xattrs->ht, &iter); - rhashtable_walk_start(&iter); - - while ((xattr = rhashtable_walk_next(&iter)) != NULL) { - if (IS_ERR(xattr)) { - if (PTR_ERR(xattr) == -EAGAIN) - continue; - err = PTR_ERR(xattr); - break; - } - + rcu_read_lock(); + list_for_each_entry_rcu(xattr, xattrs, node) { /* skip "trusted." 
attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; @@ -1593,15 +1641,14 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs **xattrsp, if (err) break; } - - rhashtable_walk_stop(&iter); - rhashtable_walk_exit(&iter); + rcu_read_unlock(); return err ? err : size - remaining_size; } /** * simple_xattr_add - add xattr objects + * @cache: anchor for the hash table * @xattrs: the header of the xattr object * @new_xattr: the xattr object to add * @@ -1612,125 +1659,67 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs **xattrsp, * Return: On success zero is returned. On failure a negative error code is * returned. */ -int simple_xattr_add(struct simple_xattrs **xattrsp, +int simple_xattr_add(struct simple_xattr_cache *cache, struct list_head *xattrs, struct simple_xattr *new_xattr) { - struct simple_xattrs *xattrs; - - xattrs = simple_xattrs_lazy_alloc(xattrsp, new_xattr->value, 0); - if (IS_ERR(xattrs)) - return PTR_ERR(xattrs); - - return rhashtable_insert_fast(&xattrs->ht, &new_xattr->hash_node, - simple_xattr_params); -} - -/** - * simple_xattrs_init - initialize new xattr header - * @xattrs: header to initialize - * - * Initialize the rhashtable used to store xattr objects. - * - * Return: On success zero is returned. On failure a negative error code is - * returned. - */ -int simple_xattrs_init(struct simple_xattrs *xattrs) -{ - return rhashtable_init(&xattrs->ht, &simple_xattr_params); -} - -/** - * simple_xattrs_alloc - allocate and initialize a new xattr header - * - * Dynamically allocate a simple_xattrs header and initialize the - * underlying rhashtable. This is intended for consumers that want - * to lazily allocate xattr storage only when the first xattr is set, - * avoiding the per-inode rhashtable overhead when no xattrs are used. - * - * Return: On success a new simple_xattrs is returned. On failure an - * ERR_PTR is returned. 
- */ -static struct simple_xattrs *simple_xattrs_alloc(void) -{ - struct simple_xattrs *xattrs __free(kfree) = NULL; - int ret; + struct rhashtable *ht; + int err; - xattrs = kzalloc(sizeof(*xattrs), GFP_KERNEL); - if (!xattrs) - return ERR_PTR(-ENOMEM); + ht = simple_xattrs_lazy_alloc(cache, new_xattr->value, 0); + if (IS_ERR(ht)) + return PTR_ERR(ht); - ret = simple_xattrs_init(xattrs); - if (ret) - return ERR_PTR(ret); + new_xattr->parent = xattrs; + err = rhashtable_insert_fast(ht, &new_xattr->hash_node, simple_xattr_params); + if (err) + return err; - return no_free_ptr(xattrs); + list_add_tail_rcu(&new_xattr->node, xattrs); + return 0; } /** - * simple_xattrs_lazy_alloc - get or allocate xattrs for a set operation - * @xattrsp: pointer to the xattrs pointer (may point to NULL) - * @value: value being set (NULL means remove) - * @flags: xattr set flags - * - * For lazily-allocated xattrs on the write path. If no xattrs exist yet - * and this is a remove operation, returns the appropriate result without - * allocating. Otherwise ensures xattrs is allocated and published with - * store-release semantics. + * simple_xattrs_free - free xattrs + * @cache: anchor for the hash table + * @xattrs: xattr header whose xattrs to destroy + * @freed_space: approximate number of bytes of memory freed from @xattrs * - * Return: On success a valid pointer to the xattrs is returned. On - * failure or early-exit an ERR_PTR or NULL is returned. Callers should - * check with IS_ERR_OR_NULL() and propagate with PTR_ERR() which - * correctly returns 0 for the NULL no-op case. + * Destroy all xattrs in @xattrs. When this is called no one can hold a + * reference to any of the xattrs anymore. 
*/ -static struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, - const void *value, int flags) +void simple_xattrs_free(struct simple_xattr_cache *cache, struct list_head *xattrs, + size_t *freed_space) { - struct simple_xattrs *xattrs; - - xattrs = READ_ONCE(*xattrsp); - if (xattrs) - return xattrs; - - if (!value) - return (flags & XATTR_REPLACE) ? ERR_PTR(-ENODATA) : NULL; - - xattrs = simple_xattrs_alloc(); - if (!IS_ERR(xattrs)) - smp_store_release(xattrsp, xattrs); - return xattrs; -} + if (freed_space) + *freed_space = 0; -static void simple_xattr_ht_free(void *ptr, void *arg) -{ - struct simple_xattr *xattr = ptr; - size_t *freed_space = arg; + while (!list_empty(xattrs)) { + struct simple_xattr *xattr = list_first_entry(xattrs, typeof(*xattr), node); - if (freed_space) - *freed_space += simple_xattr_space(xattr->name, xattr->size); - simple_xattr_free(xattr); + rhashtable_remove_fast(cache->ht, &xattr->hash_node, simple_xattr_params); + list_del(&xattr->node); + if (freed_space) + *freed_space += simple_xattr_space(xattr->name, xattr->size); + /* + * Free with RCU, since the xattr might still get accessed by + * the hash compare function + */ + simple_xattr_free_rcu(xattr); + } } /** - * simple_xattrs_free - free xattrs - * @xattrs: xattr header whose xattrs to destroy - * @freed_space: approximate number of bytes of memory freed from @xattrs + * simple_xattr_cache_cleanup - free the cache + * @cache: anchor for the hash table * - * Destroy all xattrs in @xattr. When this is called no one can hold a - * reference to any of the xattrs anymore. + * Destroy the cache table, which was lazily allocated on adding the first xattr. 
*/ -void simple_xattrs_free(struct simple_xattrs **xattrsp, size_t *freed_space) +void simple_xattr_cache_cleanup(struct simple_xattr_cache *cache) { - struct simple_xattrs *xattrs = *xattrsp; - - might_sleep(); - - if (!xattrs) - return; - - if (freed_space) - *freed_space = 0; - rhashtable_free_and_destroy(&xattrs->ht, simple_xattr_ht_free, - freed_space); - kfree(xattrs); - *xattrsp = NULL; + if (cache->ht) { + WARN_ON(atomic_read(&cache->ht->nelems)); + rhashtable_destroy(cache->ht); + kfree(cache->ht); + cache->ht = NULL; + } } diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 93a0ba872ebe..69b0177da156 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -48,7 +48,7 @@ struct shmem_inode_info { }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ - struct simple_xattrs *xattrs; /* list of xattrs */ + struct list_head xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ @@ -89,6 +89,7 @@ struct shmem_sb_info { struct list_head shrinklist; /* List of shinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ struct shmem_quota_limits qlimits; /* Default quota limits */ + struct simple_xattr_cache xa_cache; }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) diff --git a/include/linux/xattr.h b/include/linux/xattr.h index ded446c1ef81..7aaaf4f8aff5 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -106,12 +106,14 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) return handler->prefix ?: handler->name; } -struct simple_xattrs { - struct rhashtable ht; +struct simple_xattr_cache { + struct rhashtable *ht; }; struct simple_xattr { struct rhash_head hash_node; + struct list_head *parent; + struct list_head node; struct rcu_head rcu; char *name; 
size_t size; @@ -132,27 +134,31 @@ static inline void simple_xattr_limits_init(struct simple_xattr_limits *limits) atomic_set(&limits->xattr_size, 0); } -int simple_xattrs_init(struct simple_xattrs *xattrs); -void simple_xattrs_free(struct simple_xattrs **xattrs, size_t *freed_space); +void simple_xattrs_free(struct simple_xattr_cache *cache, struct list_head *xattrs, + size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); void simple_xattr_free_rcu(struct simple_xattr *xattr); -int simple_xattr_get(struct simple_xattrs **xattrs, const char *name, - void *buffer, size_t size); -struct simple_xattr *simple_xattr_set(struct simple_xattrs **xattrs, +int simple_xattr_get(struct simple_xattr_cache *cache, struct list_head *xattrs, + const char *name, void *buffer, size_t size); +struct simple_xattr *simple_xattr_set(struct simple_xattr_cache *cache, + struct list_head *xattrs, const char *name, const void *value, size_t size, int flags); -int simple_xattr_set_limited(struct simple_xattrs **xattrs, +int simple_xattr_set_limited(struct simple_xattr_cache *cache, + struct list_head *xattrs, struct simple_xattr_limits *limits, const char *name, const void *value, size_t size, int flags); -ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs **xattrs, +ssize_t simple_xattr_list(struct inode *inode, struct list_head *xattrs, char *buffer, size_t size); -int simple_xattr_add(struct simple_xattrs **xattrs, +int simple_xattr_add(struct simple_xattr_cache *cache, struct list_head *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); +void simple_xattr_cache_cleanup(struct simple_xattr_cache *cache); + DEFINE_CLASS(simple_xattr, struct simple_xattr *, if (!IS_ERR_OR_NULL(_T)) simple_xattr_free(_T), diff --git a/mm/shmem.c b/mm/shmem.c index 
cf4ee9f41191..7b1ea9fb598f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1425,7 +1425,7 @@ static void shmem_evict_inode(struct inode *inode) } } - simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); + simple_xattrs_free(&sbinfo->xa_cache, &info->xattrs, sbinfo->max_inodes ? &freed : NULL); shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); @@ -3084,6 +3084,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); + INIT_LIST_HEAD_RCU(&info->xattrs); spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; @@ -4258,7 +4259,7 @@ static int shmem_initxattrs(struct inode *inode, if (!new_xattr->name) break; - if (simple_xattr_add(&info->xattrs, new_xattr)) + if (simple_xattr_add(&sbinfo->xa_cache, &info->xattrs, new_xattr)) break; if (sbinfo->max_inodes) @@ -4283,10 +4284,11 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); - return simple_xattr_get(&info->xattrs, name, buffer, size); + return simple_xattr_get(&sbinfo->xa_cache, &info->xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, @@ -4314,7 +4316,7 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, return -ENOSPC; } - old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(&sbinfo->xa_cache, &info->xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) @@ -4963,6 +4965,9 @@ static void shmem_put_super(struct super_block *sb) free_percpu(sbinfo->ino_batch); 
percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); +#ifdef CONFIG_TMPFS_XATTR + simple_xattr_cache_cleanup(&sbinfo->xa_cache); +#endif kfree(sbinfo); sb->s_fs_info = NULL; } diff --git a/net/socket.c b/net/socket.c index d3597c858345..a8014f930d9e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -310,8 +310,10 @@ efault_end: static struct kmem_cache *sock_inode_cachep __ro_after_init; +static struct simple_xattr_cache sockfs_xa_cache; + struct sockfs_inode { - struct simple_xattrs *xattrs; + struct list_head xattrs; struct simple_xattr_limits xattr_limits; struct socket_alloc; }; @@ -328,7 +330,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb) si = alloc_inode_sb(sb, sock_inode_cachep, GFP_KERNEL); if (!si) return NULL; - si->xattrs = NULL; + INIT_LIST_HEAD_RCU(&si->xattrs); simple_xattr_limits_init(&si->xattr_limits); init_waitqueue_head(&si->socket.wq.wait); @@ -348,7 +350,7 @@ static void sock_evict_inode(struct inode *inode) { struct sockfs_inode *si = SOCKFS_I(inode); - simple_xattrs_free(&si->xattrs, NULL); + simple_xattrs_free(&sockfs_xa_cache, &si->xattrs, NULL); clear_inode(inode); } @@ -441,7 +443,7 @@ static int sockfs_user_xattr_get(const struct xattr_handler *handler, const char *name = xattr_full_name(handler, suffix); struct sockfs_inode *si = SOCKFS_I(inode); - return simple_xattr_get(&si->xattrs, name, value, size); + return simple_xattr_get(&sockfs_xa_cache, &si->xattrs, name, value, size); } static int sockfs_user_xattr_set(const struct xattr_handler *handler, @@ -453,7 +455,7 @@ static int sockfs_user_xattr_set(const struct xattr_handler *handler, const char *name = xattr_full_name(handler, suffix); struct sockfs_inode *si = SOCKFS_I(inode); - return simple_xattr_set_limited(&si->xattrs, &si->xattr_limits, + return simple_xattr_set_limited(&sockfs_xa_cache, &si->xattrs, &si->xattr_limits, name, value, size, flags); } -- cgit v1.2.3 From 9722955b54307e9070994f2382ec06af3d7405e0 Mon Sep 17 00:00:00 2001 From: 
Daniel Borkmann Date: Tue, 2 Jun 2026 09:40:12 +0200 Subject: bpf: Add simple xattr support to bpffs Add support for extended attributes on bpffs inodes so that user space and BPF LSM programs can attach metadata - for example, a content hash or a security label - to a pinned object or directory. BPF LSM or user space tooling can then uniformly look at this (e.g. security.bpf.*) in a similar way to other filesystems. The store is in-memory and non-persistent: it lives only for the lifetime of the mount, like everything else in bpffs. The modelling is similar to tmpfs. bpffs serves the trusted.* and security.* namespaces; user.* is left unsupported. As bpffs is FS_USERNS_MOUNT, security.* is reachable by the unprivileged mounter in a user namespace, and thus we are using the simple_xattr_set_limited() infra there (trusted.* needs global CAP_SYS_ADMIN). bpf_fill_super() is open-coded instead of using simple_fill_super(), because the root inode must now be allocated through bpf_fs_alloc_inode(), i.e. carry the bpf_fs_inode wrapper and come from the right cache, which requires s_op (and s_xattr) to be installed before the first inode is created. While at it, also harden s_iflags with SB_I_NOEXEC and SB_I_NODEV. bpf_fs_listxattr() is only reachable through the filesystem via i_op->listxattr, so the BPF token inode is left untouched. Name-based fsetxattr()/fgetxattr() on a token fd still work since the get/set handlers are installed at the superblock. For the security.* namespace, we use simple_xattr_set_limited(), but there was no simple_xattr_add_limited() API yet, which is needed in bpf_fs_initxattrs() to avoid underflows in the accounting, so add one. The symlink target is freed in bpf_free_inode() rather than in bpf_destroy_inode() so that it is released only after an RCU grace period, as an RCU path walk following the symlink may still dereference inode->i_link in security_inode_follow_link().
Lastly, the symlink target allocated in bpf_symlink() is switched to GFP_KERNEL_ACCOUNT, so the string is charged to the caller's memcg. Signed-off-by: Daniel Borkmann Link: https://patch.msgid.link/20260602074012.416289-1-daniel@iogearbox.net Cc: Christian Brauner Signed-off-by: Christian Brauner (Amutable) --- fs/xattr.c | 33 +++++++ include/linux/bpf.h | 3 + include/linux/xattr.h | 4 + kernel/bpf/inode.c | 256 ++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 277 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/xattr.c b/fs/xattr.c index 89374cd9029a..ec2a4f3759d8 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1678,6 +1678,39 @@ int simple_xattr_add(struct simple_xattr_cache *cache, struct list_head *xattrs, return 0; } +/** + * simple_xattr_add_limited - add an xattr object, charging per-inode limits + * @cache: anchor for the hash table + * @xattrs: the header of the xattr object + * @limits: per-inode limit counters + * @new_xattr: the xattr object to add + * + * Like simple_xattr_add(), but also accounts @new_xattr against @limits so + * that a later removal or replacement of it through simple_xattr_set_limited() + * decrements counters that were actually incremented, rather than underflowing + * them. Use this instead of simple_xattr_add() when seeding initial xattrs + * that share a namespace with the limited set/remove path. + * + * Return: On success zero is returned. On failure a negative error code is + * returned.
+ */ +int simple_xattr_add_limited(struct simple_xattr_cache *cache, + struct list_head *xattrs, + struct simple_xattr_limits *limits, + struct simple_xattr *new_xattr) +{ + int err; + + err = simple_xattr_limits_inc(limits, new_xattr->size); + if (err) + return err; + + err = simple_xattr_add(cache, xattrs, new_xattr); + if (err) + simple_xattr_limits_dec(limits, new_xattr->size); + return err; +} + /** * simple_xattrs_free - free xattrs * @cache: anchor for the hash table diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b703c90ca9..434ba91401c6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ #include #include #include +#include #include struct bpf_verifier_env; @@ -1918,6 +1919,8 @@ struct bpf_mount_opts { u64 delegate_maps; u64 delegate_progs; u64 delegate_attachs; + + struct simple_xattr_cache xa_cache; }; struct bpf_token { diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 7aaaf4f8aff5..54ac3cbc133f 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -155,6 +155,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct list_head *xattrs, char *buffer, size_t size); int simple_xattr_add(struct simple_xattr_cache *cache, struct list_head *xattrs, struct simple_xattr *new_xattr); +int simple_xattr_add_limited(struct simple_xattr_cache *cache, + struct list_head *xattrs, + struct simple_xattr_limits *limits, + struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); void simple_xattr_cache_cleanup(struct simple_xattr_cache *cache); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 25c06a011825..c3f79b5a2f8c 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,9 @@ #include #include #include +#include +#include + #include "preload/bpf_preload.h" enum bpf_type { @@ -30,6 +33,23 @@ enum bpf_type { BPF_TYPE_LINK, }; +struct bpf_fs_inode { + struct list_head xattrs; + struct simple_xattr_limits xlimits; + struct inode 
vfs_inode; +}; + +static inline struct bpf_fs_inode *BPF_FS_I(struct inode *inode) +{ + return container_of(inode, struct bpf_fs_inode, vfs_inode); +} + +static struct kmem_cache *bpf_fs_inode_cachep __ro_after_init; + +static int bpf_fs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info); +static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size); + static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { @@ -94,10 +114,17 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) } static const struct inode_operations bpf_dir_iops; +static const struct inode_operations bpf_symlink_iops; -static const struct inode_operations bpf_prog_iops = { }; -static const struct inode_operations bpf_map_iops = { }; -static const struct inode_operations bpf_link_iops = { }; +static const struct inode_operations bpf_prog_iops = { + .listxattr = bpf_fs_listxattr, +}; +static const struct inode_operations bpf_map_iops = { + .listxattr = bpf_fs_listxattr, +}; +static const struct inode_operations bpf_link_iops = { + .listxattr = bpf_fs_listxattr, +}; struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -153,11 +180,19 @@ static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; + int ret; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return ERR_CAST(inode); + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ERR_PTR(ret); + } + inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; @@ -330,10 +365,20 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; - struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); + struct inode *inode; + int ret; + + inode = 
bpf_get_inode(dir->i_sb, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ret; + } + inode->i_op = iops; inode->i_fop = fops; inode->i_private = raw; @@ -382,9 +427,11 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *target) { - char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); struct inode *inode; + char *link; + int ret; + link = kstrdup(target, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -394,13 +441,25 @@ static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, return PTR_ERR(inode); } - inode->i_op = &simple_symlink_inode_operations; + inode->i_op = &bpf_symlink_iops; inode->i_link = link; + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ret; + } + bpf_dentry_finalize(dentry, inode, dir); return 0; } +static const struct inode_operations bpf_symlink_iops = { + .get_link = simple_get_link, + .listxattr = bpf_fs_listxattr, +}; + static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mkdir = bpf_mkdir, @@ -409,6 +468,7 @@ static const struct inode_operations bpf_dir_iops = { .rename = simple_rename, .link = simple_link, .unlink = simple_unlink, + .listxattr = bpf_fs_listxattr, }; /* pin iterator link into bpffs */ @@ -762,22 +822,147 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } +static struct inode *bpf_fs_alloc_inode(struct super_block *sb) +{ + struct bpf_fs_inode *bi; + + bi = alloc_inode_sb(sb, bpf_fs_inode_cachep, GFP_KERNEL); + if (!bi) + return NULL; + INIT_LIST_HEAD_RCU(&bi->xattrs); + simple_xattr_limits_init(&bi->xlimits); + return &bi->vfs_inode; +} + static void 
bpf_destroy_inode(struct inode *inode) { + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); enum bpf_type type; - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); - free_inode_nonrcu(inode); + simple_xattrs_free(&opts->xa_cache, &bi->xattrs, NULL); +} + +static void bpf_free_inode(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + kmem_cache_free(bpf_fs_inode_cachep, BPF_FS_I(inode)); +} + +static int bpf_fs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + + name = xattr_full_name(handler, name); + return simple_xattr_get(&opts->xa_cache, &bi->xattrs, name, value, size); +} + +enum { + BPF_FS_XATTR_UNSPEC, + BPF_FS_XATTR_SECURITY, + BPF_FS_XATTR_TRUSTED, +}; + +static int bpf_fs_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + struct simple_xattr *old; + int err = -EINVAL; + + name = xattr_full_name(handler, name); + switch (handler->flags) { + case BPF_FS_XATTR_SECURITY: + err = simple_xattr_set_limited(&opts->xa_cache, &bi->xattrs, + &bi->xlimits, name, value, size, + flags); + break; + case BPF_FS_XATTR_TRUSTED: + old = simple_xattr_set(&opts->xa_cache, &bi->xattrs, name, + value, size, flags); + err = IS_ERR(old) ? 
PTR_ERR(old) : 0; + if (!err) + simple_xattr_free_rcu(old); + break; + } + if (err) + return err; + inode_set_ctime_current(inode); + return 0; +} + +static const struct xattr_handler bpf_fs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = BPF_FS_XATTR_TRUSTED, + .get = bpf_fs_xattr_get, + .set = bpf_fs_xattr_set, +}; + +static const struct xattr_handler bpf_fs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = BPF_FS_XATTR_SECURITY, + .get = bpf_fs_xattr_get, + .set = bpf_fs_xattr_set, +}; + +static const struct xattr_handler * const bpf_fs_xattr_handlers[] = { + &bpf_fs_trusted_xattr_handler, + &bpf_fs_security_xattr_handler, + NULL, +}; + +static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct inode *inode = d_inode(dentry); + + return simple_xattr_list(inode, &BPF_FS_I(inode)->xattrs, buf, size); +} + +static int bpf_fs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + const struct xattr *xattr; + int err; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); + if (IS_ERR(new_xattr)) + return PTR_ERR(new_xattr); + + new_xattr->name = kasprintf(GFP_KERNEL_ACCOUNT, + XATTR_SECURITY_PREFIX "%s", + xattr->name); + if (!new_xattr->name) + return -ENOMEM; + + err = simple_xattr_add_limited(&opts->xa_cache, &bi->xattrs, + &bi->xlimits, new_xattr); + if (err) + return err; + + retain_and_null_ptr(new_xattr); + } + return 0; } const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = inode_just_drop, .show_options = bpf_show_options, + .alloc_inode = bpf_fs_alloc_inode, .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { @@ -996,25 +1181,38 @@ out: static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { - 
static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts *opts = sb->s_fs_info; struct inode *inode; - int ret; /* Mounting an instance of BPF FS requires privileges */ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) return -EPERM; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); - if (ret) - return ret; - + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = BPF_FS_MAGIC; sb->s_op = &bpf_super_ops; + sb->s_xattr = bpf_fs_xattr_handlers; + sb->s_iflags |= SB_I_NOEXEC; + sb->s_iflags |= SB_I_NODEV; + sb->s_time_gran = 1; + + inode = bpf_get_inode(sb, NULL, S_IFDIR | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_ino = 1; + inode->i_op = &bpf_dir_iops; + inode->i_fop = &simple_dir_operations; + set_nlink(inode, 2); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; - inode = sb->s_root->d_inode; + inode = d_inode(sb->s_root); inode->i_uid = opts->uid; inode->i_gid = opts->gid; - inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); inode->i_mode |= S_ISVTX | opts->mode; @@ -1068,6 +1266,7 @@ static void bpf_kill_super(struct super_block *sb) struct bpf_mount_opts *opts = sb->s_fs_info; kill_anon_super(sb); + simple_xattr_cache_cleanup(&opts->xa_cache); kfree(opts); } @@ -1080,18 +1279,37 @@ static struct file_system_type bpf_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; +static void bpf_fs_inode_init_once(void *foo) +{ + struct bpf_fs_inode *bi = foo; + + inode_init_once(&bi->vfs_inode); +} + static int __init bpf_init(void) { int ret; + bpf_fs_inode_cachep = kmem_cache_create("bpf_fs_inode_cache", + sizeof(struct bpf_fs_inode), + 0, SLAB_ACCOUNT, + bpf_fs_inode_init_once); + if (!bpf_fs_inode_cachep) + return -ENOMEM; + ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) - return ret; + goto out_cache; ret = register_filesystem(&bpf_fs_type); - if (ret) + if (ret) { sysfs_remove_mount_point(fs_kobj, "bpf"); + goto 
out_cache; + } + return 0; +out_cache: + kmem_cache_destroy(bpf_fs_inode_cachep); return ret; } fs_initcall(bpf_init); -- cgit v1.2.3