From 1fa4e69a54a250fa17d2afd9c5b54a59329033c1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:48:36 +0100 Subject: filelock: use a consume fence in locks_inode_context() Matches the idiom of storing a pointer with a release fence and safely getting the content with a consume fence after. Eliminates an actual fence on some archs. Reviewed-by: Jeff Layton Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203094837.290654-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/filelock.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 54b824c05299..dc15f5427680 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -241,7 +241,10 @@ bool locks_owner_has_blockers(struct file_lock_context *flctx, static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { - return smp_load_acquire(&inode->i_flctx); + /* + * Paired with the fence in locks_get_lock_context(). + */ + return READ_ONCE(inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ -- cgit v1.2.3 From 887e97745ec336c2f49b6c0af3c4cc00a5df3211 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:48:37 +0100 Subject: fs: track the inode having file locks with a flag in ->i_opflags Opening and closing an inode dirties the ->i_readcount field. Depending on the alignment of the inode, it may happen to false-share with other fields loaded for both operations to various extents. This notably concerns the ->i_flctx field. Since most inodes don't have the field populated, this bit can be managed with a flag in ->i_opflags instead which bypasses the problem. Here are results I obtained while opening a file read-only in a loop with 24 cores doing the work on Sapphire Rapids. 
Utilizing the flag as opposed to reading ->i_flctx field was toggled at runtime as the benchmark was running, to make sure both results come from the same alignment. before: 3233740 after: 3373346 (+4%) before: 3284313 after: 3518711 (+7%) before: 3505545 after: 4092806 (+16%) Or to put it differently, this varies wildly depending on how (un)lucky you get. The primary bottleneck before and after is the avoidable lockref trip in do_dentry_open(). Reviewed-by: Jeff Layton Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203094837.290654-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/locks.c | 14 ++++++++++++-- include/linux/filelock.h | 15 +++++++++++---- include/linux/fs.h | 1 + 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 9f565802a88c..7a63fa3ca9b4 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -178,7 +178,6 @@ locks_get_lock_context(struct inode *inode, int type) { struct file_lock_context *ctx; - /* paired with cmpxchg() below */ ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; @@ -196,7 +195,18 @@ locks_get_lock_context(struct inode *inode, int type) * Assign the pointer if it's not already assigned. If it is, then * free the context we just allocated. */ - if (cmpxchg(&inode->i_flctx, NULL, ctx)) { + spin_lock(&inode->i_lock); + if (!(inode->i_opflags & IOP_FLCTX)) { + VFS_BUG_ON_INODE(inode->i_flctx, inode); + WRITE_ONCE(inode->i_flctx, ctx); + /* + * Paired with locks_inode_context(). 
+ */ + smp_store_release(&inode->i_opflags, inode->i_opflags | IOP_FLCTX); + spin_unlock(&inode->i_lock); + } else { + VFS_BUG_ON_INODE(!inode->i_flctx, inode); + spin_unlock(&inode->i_lock); kmem_cache_free(flctx_cache, ctx); ctx = locks_inode_context(inode); } diff --git a/include/linux/filelock.h b/include/linux/filelock.h index dc15f5427680..4a8912b9653e 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -242,8 +242,12 @@ static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { /* - * Paired with the fence in locks_get_lock_context(). + * Paired with smp_store_release in locks_get_lock_context(). + * + * Ensures ->i_flctx will be visible if we spotted the flag. */ + if (likely(!(smp_load_acquire(&inode->i_opflags) & IOP_FLCTX))) + return NULL; return READ_ONCE(inode->i_flctx); } @@ -471,7 +475,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode) * could end up racing with tasks trying to set a new lease on this * file. */ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -490,7 +494,7 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) * could end up racing with tasks trying to set a new lease on this * file. 
*/ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -535,8 +539,11 @@ static inline int break_deleg_wait(struct delegated_inode *di) static inline int break_layout(struct inode *inode, bool wait) { + struct file_lock_context *flctx; + smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { + flctx = locks_inode_context(inode); + if (flctx && !list_empty_careful(&flctx->flc_lease)) { unsigned int flags = LEASE_BREAK_LAYOUT; if (!wait) diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..094b0adcb035 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -631,6 +631,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 #define IOP_FASTPERM_MAY_EXEC 0x0080 +#define IOP_FLCTX 0x0100 /* * Inode state bits. Protected by inode->i_lock -- cgit v1.2.3 From c0aac5975bafc86f6817b14e9f71dcb5064a9183 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:28:50 +0100 Subject: ns: pad refcount Note no effort is made to make sure structs embedding the namespace are themselves aligned, so this is not guaranteed to eliminate cacheline bouncing due to refcount management. 
Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203092851.287617-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/ns/ns_common_types.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h index b332b019b29c..0014fbc1c626 100644 --- a/include/linux/ns/ns_common_types.h +++ b/include/linux/ns/ns_common_types.h @@ -108,11 +108,13 @@ extern const struct proc_ns_operations utsns_operations; * @ns_tree: namespace tree nodes and active reference count */ struct ns_common { + struct { + refcount_t __ns_ref; /* do not use directly */ + } ____cacheline_aligned_in_smp; u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t __ns_ref; /* do not use directly */ union { struct ns_tree; struct rcu_head ns_rcu; -- cgit v1.2.3 From 0f166bf1d6d82701cc1d94445cc2a9107d1790df Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 23 Dec 2025 08:00:39 +0100 Subject: select: store end_time as timespec64 in restart block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Storing the end time seconds as 'unsigned long' can lead to truncation on 32-bit architectures if assigned from the 64-bit timespec64::tv_sec. As the select() core uses timespec64 consistently, also use that in the restart block. This also allows the simplification of the accessors. 
Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251223-restart-block-expiration-v2-1-8e33e5df7359@linutronix.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/select.c | 12 ++++-------- include/linux/restart_block.h | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/select.c b/fs/select.c index 65019b8ba3f7..78a1508c84d3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1038,14 +1038,11 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - struct timespec64 *to = NULL, end_time; + struct timespec64 *to = NULL; int ret; - if (restart_block->poll.has_timeout) { - end_time.tv_sec = restart_block->poll.tv_sec; - end_time.tv_nsec = restart_block->poll.tv_nsec; - to = &end_time; - } + if (restart_block->poll.has_timeout) + to = &restart_block->poll.end_time; ret = do_sys_poll(ufds, nfds, to); @@ -1077,8 +1074,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { - restart_block->poll.tv_sec = end_time.tv_sec; - restart_block->poll.tv_nsec = end_time.tv_nsec; + restart_block->poll.end_time = end_time; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 67d2bf579942..9b262109726d 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -6,6 +6,7 @@ #define __LINUX_RESTART_BLOCK_H #include +#include #include struct __kernel_timespec; @@ -50,8 +51,7 @@ struct restart_block { struct pollfd __user *ufds; int nfds; int has_timeout; - unsigned long tv_sec; - unsigned long tv_nsec; + struct timespec64 end_time; } poll; }; }; -- cgit v1.2.3 From 6784f274722559c0cdaaa418bc8b7b1d61c314f9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 7 Jan 2026 06:06:36 -0800 Subject: 
device_cgroup: remove branch hint after code refactor commit 4ef4ac360101 ("device_cgroup: avoid access to ->i_rdev in the common case in devcgroup_inode_permission()") reordered the checks in devcgroup_inode_permission() to check the inode mode before checking i_rdev, for better cache behavior. However, the likely() annotation on the i_rdev check was not updated to reflect the new code flow. Originally, when i_rdev was checked first, likely(!inode->i_rdev) made sense because most inodes were(?) regular files/directories, thus i_rdev == 0. After the reorder, by the time we reach the i_rdev check, we have already confirmed the inode IS a block or character device. Block and character special files are precisely defined by having a device number (i_rdev), so !inode->i_rdev is now the rare edge case, not the common case. Branch profiling confirmed this is 100% mispredicted: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 2631904 100 devcgroup_inode_permission device_cgroup.h 24 Remove likely() to avoid giving the wrong hint to the CPU. 
Fixes: 4ef4ac360101 ("device_cgroup: avoid access to ->i_rdev in the common case in devcgroup_inode_permission()") Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260107-likely_device-v1-1-0c55f83a7e47@debian.org Reviewed-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/device_cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 0864773a57e8..822085bc2d20 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -21,7 +21,7 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask) if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))) return 0; - if (likely(!inode->i_rdev)) + if (!inode->i_rdev) return 0; if (S_ISBLK(inode->i_mode)) -- cgit v1.2.3 From 5e7fa6bfa9b5ced6868fc652d5c40fe0eac154d9 Mon Sep 17 00:00:00 2001 From: André Almeida Date: Mon, 12 Jan 2026 22:51:24 -0300 Subject: exportfs: Fix kernel-doc output for get_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without a space between %NAME_MAX and the plus sign, kernel-doc will output ``NAME_MAX``+1, which escapes the last backtick and makes Sphinx format a much larger string as monospaced text. Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-1-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index f0cf2714ec52..599ea86363e1 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -234,7 +234,7 @@ struct handle_to_path_ctx { * get_name: * @get_name should find a name for the given @child in the given @parent * directory. 
The name should be stored in the @name (with the - * understanding that it is already pointing to a %NAME_MAX+1 sized + * understanding that it is already pointing to a %NAME_MAX + 1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. @get_name will be called without @parent->i_rwsem held. * -- cgit v1.2.3 From fc76b5968a435894062ad4160c2e81c32cc4972e Mon Sep 17 00:00:00 2001 From: André Almeida Date: Mon, 12 Jan 2026 22:51:25 -0300 Subject: exportfs: Mark struct export_operations functions at kernel-doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding a `@` before the function names makes them recognizable as kernel-docs, so they get correctly rendered in the documentation. Even if they are already marked with `@` in the short one-line summary, the kernel-docs will correctly favor the more detailed definition here. Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-2-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 599ea86363e1..bed370b9f906 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -203,7 +203,7 @@ struct handle_to_path_ctx { * See Documentation/filesystems/nfs/exporting.rst for details on how to use * this interface correctly. * - * encode_fh: + * @encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most * @max_len bytes) information that can be used by @decode_fh to recover the * file referred to by the &struct dentry @de. If @flag has CONNECTABLE bit @@ -215,7 +215,7 @@ struct handle_to_path_ctx { * greater than @max_len*4 bytes). On error @max_len contains the minimum * size(in 4 byte unit) needed to encode the file handle. 
* - * fh_to_dentry: + * @fh_to_dentry: * @fh_to_dentry is given a &struct super_block (@sb) and a file handle * fragment (@fh, @fh_len). It should return a &struct dentry which refers * to the same file that the file handle fragment refers to. If it cannot, @@ -227,29 +227,29 @@ struct handle_to_path_ctx { * created with d_alloc_root. The caller can then find any other extant * dentries by following the d_alias links. * - * fh_to_parent: + * @fh_to_parent: * Same as @fh_to_dentry, except that it returns a pointer to the parent * dentry if it was encoded into the filehandle fragment by @encode_fh. * - * get_name: + * @get_name: * @get_name should find a name for the given @child in the given @parent * directory. The name should be stored in the @name (with the * understanding that it is already pointing to a %NAME_MAX + 1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. @get_name will be called without @parent->i_rwsem held. * - * get_parent: + * @get_parent: * @get_parent should find the parent directory for the given @child which * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * - * permission: + * @permission: * Allow filesystems to specify a custom permission function. * - * open: + * @open: * Allow filesystems to specify a custom open function. * - * commit_metadata: + * @commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * * Locking rules: -- cgit v1.2.3 From 7a6f811e2c06d656996776771f0498df129a0cc2 Mon Sep 17 00:00:00 2001 From: André Almeida Date: Mon, 12 Jan 2026 22:51:26 -0300 Subject: exportfs: Complete kernel-doc for struct export_operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Write down the missing members definitions for struct export_operations, using as a reference the commit messages that created the members. 
Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-3-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index bed370b9f906..262e24d83313 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -201,7 +201,7 @@ struct handle_to_path_ctx { * @commit_metadata: commit metadata changes to stable storage * * See Documentation/filesystems/nfs/exporting.rst for details on how to use - * this interface correctly. + * this interface correctly and the definition of the flags. * * @encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most @@ -252,6 +252,19 @@ struct handle_to_path_ctx { * @commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * + * @get_uuid: + * Get a filesystem unique signature exposed to clients. + * + * @map_blocks: + * Map and, if necessary, allocate blocks for a layout. + * + * @commit_blocks: + * Commit blocks in a layout once the client is done with them. + * + * @flags: + * Allows the filesystem to communicate to nfsd that it may want to do things + * differently when dealing with it. + * * Locking rules: * get_parent is called with child->d_inode->i_rwsem down * get_name is not (which is possibly inconsistent) -- cgit v1.2.3 From 6cbfdf89470ef3c2110f376a507d135e7a7a7378 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 15 Jan 2026 13:23:40 +0100 Subject: posix_acl: make posix_acl_to_xattr() alloc the buffer Without exception all callers do that. So move the allocation into the helper. This reduces boilerplate and removes unnecessary error checking. 
Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260115122341.556026-1-mszeredi@redhat.com Signed-off-by: Christian Brauner --- fs/9p/acl.c | 16 +++---------- fs/btrfs/acl.c | 10 +++------ fs/ceph/acl.c | 50 ++++++++++++++++++----------------------- fs/fuse/acl.c | 12 ++++------ fs/gfs2/acl.c | 13 +++-------- fs/jfs/acl.c | 9 ++------ fs/ntfs3/xattr.c | 6 +---- fs/orangefs/acl.c | 8 +------ fs/posix_acl.c | 21 ++++++++--------- include/linux/posix_acl_xattr.h | 5 +++-- 10 files changed, 53 insertions(+), 97 deletions(-) (limited to 'include/linux') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 633da5e37299..ae7e7cf7523a 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -167,17 +167,11 @@ int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, if (retval) goto err_out; - size = posix_acl_xattr_size(acl->a_count); - - value = kzalloc(size, GFP_NOFS); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS); if (!value) { retval = -ENOMEM; goto err_out; } - - retval = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (retval < 0) - goto err_out; } /* @@ -257,13 +251,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) return 0; /* Set a setxattr request to server */ - size = posix_acl_xattr_size(acl->a_count); - buffer = kmalloc(size, GFP_KERNEL); + buffer = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); if (!buffer) return -ENOMEM; - retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - if (retval < 0) - goto err_free_out; + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; @@ -275,7 +266,6 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) BUG(); } retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0); -err_free_out: kfree(buffer); return retval; } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index c336e2ab7f8a..e55b686fe1ab 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -57,7 +57,8 @@ struct posix_acl 
*btrfs_get_acl(struct inode *inode, int type, bool rcu) int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { - int ret, size = 0; + int ret; + size_t size = 0; const char *name; char AUTO_KFREE(value); @@ -77,20 +78,15 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, if (acl) { unsigned int nofs_flag; - size = posix_acl_xattr_size(acl->a_count); /* * We're holding a transaction handle, so use a NOFS memory * allocation context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!value) return -ENOMEM; - - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (ret < 0) - return ret; } if (trans) diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 1564eacc253d..85d3dd48b167 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -90,7 +90,8 @@ retry: int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { - int ret = 0, size = 0; + int ret = 0; + size_t size = 0; const char *name = NULL; char *value = NULL; struct iattr newattrs; @@ -126,16 +127,11 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, } if (acl) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS); if (!value) { ret = -ENOMEM; goto out; } - - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (ret < 0) - goto out_free; } if (new_mode != old_mode) { @@ -172,7 +168,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct posix_acl *acl, *default_acl; size_t val_size1 = 0, val_size2 = 0; struct ceph_pagelist *pagelist = NULL; - void *tmp_buf = NULL; + void *tmp_buf1 = NULL, *tmp_buf2 = NULL; int err; err = posix_acl_create(dir, mode, &default_acl, &acl); @@ -192,15 +188,7 @@ int 
ceph_pre_init_acls(struct inode *dir, umode_t *mode, if (!default_acl && !acl) return 0; - if (acl) - val_size1 = posix_acl_xattr_size(acl->a_count); - if (default_acl) - val_size2 = posix_acl_xattr_size(default_acl->a_count); - err = -ENOMEM; - tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); - if (!tmp_buf) - goto out_err; pagelist = ceph_pagelist_alloc(GFP_KERNEL); if (!pagelist) goto out_err; @@ -213,34 +201,39 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, if (acl) { size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS); + + err = -ENOMEM; + tmp_buf1 = posix_acl_to_xattr(&init_user_ns, acl, + &val_size1, GFP_KERNEL); + if (!tmp_buf1) + goto out_err; err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8); if (err) goto out_err; ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS, len); - err = posix_acl_to_xattr(&init_user_ns, acl, - tmp_buf, val_size1); - if (err < 0) - goto out_err; ceph_pagelist_encode_32(pagelist, val_size1); - ceph_pagelist_append(pagelist, tmp_buf, val_size1); + ceph_pagelist_append(pagelist, tmp_buf1, val_size1); } if (default_acl) { size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT); + + err = -ENOMEM; + tmp_buf2 = posix_acl_to_xattr(&init_user_ns, default_acl, + &val_size2, GFP_KERNEL); + if (!tmp_buf2) + goto out_err; err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); if (err) goto out_err; ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_DEFAULT, len); - err = posix_acl_to_xattr(&init_user_ns, default_acl, - tmp_buf, val_size2); - if (err < 0) - goto out_err; ceph_pagelist_encode_32(pagelist, val_size2); - ceph_pagelist_append(pagelist, tmp_buf, val_size2); + ceph_pagelist_append(pagelist, tmp_buf2, val_size2); } - kfree(tmp_buf); + kfree(tmp_buf1); + kfree(tmp_buf2); as_ctx->acl = acl; as_ctx->default_acl = default_acl; @@ -250,7 +243,8 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, out_err: posix_acl_release(acl); posix_acl_release(default_acl); - kfree(tmp_buf); + 
kfree(tmp_buf1); + kfree(tmp_buf2); if (pagelist) ceph_pagelist_release(pagelist); return err; diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 8f484b105f13..cbde6ac1add3 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -122,20 +122,16 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, * them to be refreshed the next time they are used, * and it also updates i_ctime. */ - size_t size = posix_acl_xattr_size(acl->a_count); + size_t size; void *value; - if (size > PAGE_SIZE) - return -E2BIG; - - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(fc->user_ns, acl, &size, GFP_KERNEL); if (!value) return -ENOMEM; - ret = posix_acl_to_xattr(fc->user_ns, acl, value, size); - if (ret < 0) { + if (size > PAGE_SIZE) { kfree(value); - return ret; + return -E2BIG; } /* diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 443640e6fb9c..a5b60778b91c 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -83,21 +83,14 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu) int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; - size_t len; - char *data; + size_t len = 0; + char *data = NULL; const char *name = gfs2_acl_name(type); if (acl) { - len = posix_acl_xattr_size(acl->a_count); - data = kmalloc(len, GFP_NOFS); + data = posix_acl_to_xattr(&init_user_ns, acl, &len, GFP_NOFS); if (data == NULL) return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, data, len); - if (error < 0) - goto out; - } else { - data = NULL; - len = 0; } error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 1de3602c98de..16b71a23ff1e 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -61,7 +61,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, { char *ea_name; int rc; - int size = 0; + size_t size = 0; char *value = NULL; switch (type) { @@ -76,16 +76,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, } if (acl) { - size 
= posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); if (!value) return -ENOMEM; - rc = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (rc < 0) - goto out; } rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0); -out: kfree(value); if (!rc) diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index c93df55e98d0..37a69a75ce68 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -641,13 +641,9 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, value = NULL; flags = XATTR_REPLACE; } else { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS); if (!value) return -ENOMEM; - err = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (err < 0) - goto out; flags = 0; } diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 5aefb705bcc8..a01ef0c1b1bf 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -90,14 +90,9 @@ int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) type); if (acl) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); if (!value) return -ENOMEM; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (error < 0) - goto out; } gossip_debug(GOSSIP_ACL_DEBUG, @@ -111,7 +106,6 @@ int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) */ error = orangefs_inode_setxattr(inode, name, value, size, 0); -out: kfree(value); if (!error) set_cached_acl(inode, type, acl); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 768f027c1428..4ef6f9d2b8d6 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -829,19 +829,19 @@ EXPORT_SYMBOL (posix_acl_from_xattr); /* * Convert from in-memory to extended attribute representation. 
*/ -int +void * posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, - void *buffer, size_t size) + size_t *sizep, gfp_t gfp) { - struct posix_acl_xattr_header *ext_acl = buffer; + struct posix_acl_xattr_header *ext_acl; struct posix_acl_xattr_entry *ext_entry; - int real_size, n; + size_t size; + int n; - real_size = posix_acl_xattr_size(acl->a_count); - if (!buffer) - return real_size; - if (real_size > size) - return -ERANGE; + size = posix_acl_xattr_size(acl->a_count); + ext_acl = kmalloc(size, gfp); + if (!ext_acl) + return NULL; ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); @@ -864,7 +864,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, break; } } - return real_size; + *sizep = size; + return ext_acl; } EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index e86f3b731da2..9e1892525eac 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -44,8 +44,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, } #endif -int posix_acl_to_xattr(struct user_namespace *user_ns, - const struct posix_acl *acl, void *buffer, size_t size); +extern void *posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + size_t *sizep, gfp_t gfp); + static inline const char *posix_acl_xattr_name(int type) { switch (type) { -- cgit v1.2.3