summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-09-29 19:42:30 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2025-09-29 19:42:30 +0300
commit56e7b310717697109998966cb3c4d3e490d09200 (patch)
tree24075d8293ba26a84877205799d62218db2d8477 /include/linux
parent3a2a5b278fb8d4cdb3154b8e4a38352b945f96fd (diff)
parentc3c616c53dbabddf32a0485bd133d8d3b9f6656a (diff)
downloadlinux-56e7b310717697109998966cb3c4d3e490d09200.tar.xz
Merge tag 'vfs-6.18-rc1.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs inode updates from Christian Brauner: "This contains a series I originally wrote and that Eric brought over the finish line. It moves out the i_crypt_info and i_verity_info pointers out of 'struct inode' and into the fs-specific part of the inode. So now the few filesytems that actually make use of this pay the price in their own private inode storage instead of forcing it upon every user of struct inode. The pointer for the crypt and verity info is simply found by storing an offset to its address in struct fsverity_operations and struct fscrypt_operations. This shrinks struct inode by 16 bytes. I hope to move a lot more out of it in the future so that struct inode becomes really just about very core stuff that we need, much like struct dentry and struct file, instead of the dumping ground it has become over the years. On top of this are a various changes associated with the ongoing inode lifetime handling rework that multiple people are pushing forward: - Stop accessing inode->i_count directly in f2fs and gfs2. They simply should use the __iget() and iput() helpers - Make the i_state flags an enum - Rework the iput() logic Currently, if we are the last iput, and we have the I_DIRTY_TIME bit set, we will grab a reference on the inode again and then mark it dirty and then redo the put. This is to make sure we delay the time update for as long as possible We can rework this logic to simply dec i_count if it is not 1, and if it is do the time update while still holding the i_count reference Then we can replace the atomic_dec_and_lock with locking the ->i_lock and doing atomic_dec_and_test, since we did the atomic_add_unless above - Add an icount_read() helper and convert everyone that accesses inode->i_count directly for this purpose to use the helper - Expand dump_inode() to dump more information about an inode helping in debugging - Add some might_sleep() annotations to iput() and associated helpers" * tag 'vfs-6.18-rc1.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: fs: add might_sleep() annotation to iput() and more fs: expand dump_inode() inode: fix whitespace issues fs: add an icount_read helper fs: rework iput logic fs: make the i_state flags an enum fs: stop accessing ->i_count directly in f2fs and gfs2 fsverity: check IS_VERITY() in fsverity_cleanup_inode() fs: remove inode::i_verity_info btrfs: move verity info pointer to fs-specific part of inode f2fs: move verity info pointer to fs-specific part of inode ext4: move verity info pointer to fs-specific part of inode fsverity: add support for info in fs-specific part of inode fs: remove inode::i_crypt_info ceph: move crypt info pointer to fs-specific part of inode ubifs: move crypt info pointer to fs-specific part of inode f2fs: move crypt info pointer to fs-specific part of inode ext4: move crypt info pointer to fs-specific part of inode fscrypt: add support for info in fs-specific part of inode fscrypt: replace raw loads of info pointer with helper function
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/fs.h246
-rw-r--r--include/linux/fscrypt.h40
-rw-r--r--include/linux/fsverity.h57
3 files changed, 208 insertions, 135 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5265b76759f1..a854f2910cd9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -72,9 +72,7 @@ struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
-struct fscrypt_inode_info;
struct fscrypt_operations;
-struct fsverity_info;
struct fsverity_operations;
struct fsnotify_mark_connector;
struct fsnotify_sb_info;
@@ -669,6 +667,124 @@ is_uncached_acl(struct posix_acl *acl)
#define IOP_CACHED_LINK 0x0040
/*
+ * Inode state bits. Protected by inode->i_lock
+ *
+ * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
+ * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
+ *
+ * Four bits define the lifetime of an inode. Initially, inodes are I_NEW,
+ * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at
+ * various stages of removing an inode.
+ *
+ * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
+ *
+ * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on
+ * fdatasync() (unless I_DIRTY_DATASYNC is also set).
+ * Timestamp updates are the usual cause.
+ * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of
+ * these changes separately from I_DIRTY_SYNC so that we
+ * don't have to write inode on fdatasync() when only
+ * e.g. the timestamps have changed.
+ * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean.
+ * I_DIRTY_TIME The inode itself has dirty timestamps, and the
+ * lazytime mount option is enabled. We keep track of this
+ * separately from I_DIRTY_SYNC in order to implement
+ * lazytime. This gets cleared if I_DIRTY_INODE
+ * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But
+ * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already
+ * in place because writeback might already be in progress
+ * and we don't want to lose the time update
+ * I_NEW Serves as both a mutex and completion notification.
+ * New inodes set I_NEW. If two processes both create
+ * the same inode, one of them will release its inode and
+ * wait for I_NEW to be released before returning.
+ * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
+ * also cause waiting on I_NEW, without I_NEW actually
+ * being set. find_inode() uses this to prevent returning
+ * nearly-dead inodes.
+ * I_WILL_FREE Must be set when calling write_inode_now() if i_count
+ * is zero. I_FREEING must be set when I_WILL_FREE is
+ * cleared.
+ * I_FREEING Set when inode is about to be freed but still has dirty
+ * pages or buffers attached or the inode itself is still
+ * dirty.
+ * I_CLEAR Added by clear_inode(). In this state the inode is
+ * clean and can be destroyed. Inode keeps I_FREEING.
+ *
+ * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
+ * prohibited for many purposes. iget() must wait for
+ * the inode to be completely released, then create it
+ * anew. Other functions will just ignore such inodes,
+ * if appropriate. I_NEW is used for waiting.
+ *
+ * I_SYNC Writeback of inode is running. The bit is set during
+ * data writeback, and cleared with a wakeup on the bit
+ * address once it is done. The bit is also used to pin
+ * the inode in memory for flusher thread.
+ *
+ * I_REFERENCED Marks the inode as recently references on the LRU list.
+ *
+ * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
+ * synchronize competing switching instances and to tell
+ * wb stat updates to grab the i_pages lock. See
+ * inode_switch_wbs_work_fn() for details.
+ *
+ * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper
+ * and work dirs among overlayfs mounts.
+ *
+ * I_CREATING New object's inode in the middle of setting up.
+ *
+ * I_DONTCACHE Evict inode as soon as it is not used anymore.
+ *
+ * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists.
+ * Used to detect that mark_inode_dirty() should not move
+ * inode between dirty lists.
+ *
+ * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback.
+ *
+ * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding
+ * i_count.
+ *
+ * Q: What is the difference between I_WILL_FREE and I_FREEING?
+ *
+ * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
+ * upon. There's one free address left.
+ */
+
+enum inode_state_bits {
+ __I_NEW = 0U,
+ __I_SYNC = 1U,
+ __I_LRU_ISOLATING = 2U
+ /* reserved wait address bit 3 */
+};
+
+enum inode_state_flags_t {
+ I_NEW = (1U << __I_NEW),
+ I_SYNC = (1U << __I_SYNC),
+ I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING),
+ /* reserved flag bit 3 */
+ I_DIRTY_SYNC = (1U << 4),
+ I_DIRTY_DATASYNC = (1U << 5),
+ I_DIRTY_PAGES = (1U << 6),
+ I_WILL_FREE = (1U << 7),
+ I_FREEING = (1U << 8),
+ I_CLEAR = (1U << 9),
+ I_REFERENCED = (1U << 10),
+ I_LINKABLE = (1U << 11),
+ I_DIRTY_TIME = (1U << 12),
+ I_WB_SWITCH = (1U << 13),
+ I_OVL_INUSE = (1U << 14),
+ I_CREATING = (1U << 15),
+ I_DONTCACHE = (1U << 16),
+ I_SYNC_QUEUED = (1U << 17),
+ I_PINNING_NETFS_WB = (1U << 18)
+};
+
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
+#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
+#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
+
+/*
* Keep mostly read-only and often accessed (especially for
* the RCU path lookup and 'stat' data) fields at the beginning
* of the 'struct inode'
@@ -726,7 +842,7 @@ struct inode {
#endif
/* Misc */
- u32 i_state;
+ enum inode_state_flags_t i_state;
/* 32-bit hole */
struct rw_semaphore i_rwsem;
@@ -782,14 +898,6 @@ struct inode {
struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
#endif
-#ifdef CONFIG_FS_ENCRYPTION
- struct fscrypt_inode_info *i_crypt_info;
-#endif
-
-#ifdef CONFIG_FS_VERITY
- struct fsverity_info *i_verity_info;
-#endif
-
void *i_private; /* fs or device private pointer */
} __randomize_layout;
@@ -2492,117 +2600,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
};
}
-/*
- * Inode state bits. Protected by inode->i_lock
- *
- * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
- * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
- *
- * Four bits define the lifetime of an inode. Initially, inodes are I_NEW,
- * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at
- * various stages of removing an inode.
- *
- * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
- *
- * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on
- * fdatasync() (unless I_DIRTY_DATASYNC is also set).
- * Timestamp updates are the usual cause.
- * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of
- * these changes separately from I_DIRTY_SYNC so that we
- * don't have to write inode on fdatasync() when only
- * e.g. the timestamps have changed.
- * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean.
- * I_DIRTY_TIME The inode itself has dirty timestamps, and the
- * lazytime mount option is enabled. We keep track of this
- * separately from I_DIRTY_SYNC in order to implement
- * lazytime. This gets cleared if I_DIRTY_INODE
- * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But
- * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already
- * in place because writeback might already be in progress
- * and we don't want to lose the time update
- * I_NEW Serves as both a mutex and completion notification.
- * New inodes set I_NEW. If two processes both create
- * the same inode, one of them will release its inode and
- * wait for I_NEW to be released before returning.
- * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
- * also cause waiting on I_NEW, without I_NEW actually
- * being set. find_inode() uses this to prevent returning
- * nearly-dead inodes.
- * I_WILL_FREE Must be set when calling write_inode_now() if i_count
- * is zero. I_FREEING must be set when I_WILL_FREE is
- * cleared.
- * I_FREEING Set when inode is about to be freed but still has dirty
- * pages or buffers attached or the inode itself is still
- * dirty.
- * I_CLEAR Added by clear_inode(). In this state the inode is
- * clean and can be destroyed. Inode keeps I_FREEING.
- *
- * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
- * prohibited for many purposes. iget() must wait for
- * the inode to be completely released, then create it
- * anew. Other functions will just ignore such inodes,
- * if appropriate. I_NEW is used for waiting.
- *
- * I_SYNC Writeback of inode is running. The bit is set during
- * data writeback, and cleared with a wakeup on the bit
- * address once it is done. The bit is also used to pin
- * the inode in memory for flusher thread.
- *
- * I_REFERENCED Marks the inode as recently references on the LRU list.
- *
- * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
- * synchronize competing switching instances and to tell
- * wb stat updates to grab the i_pages lock. See
- * inode_switch_wbs_work_fn() for details.
- *
- * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper
- * and work dirs among overlayfs mounts.
- *
- * I_CREATING New object's inode in the middle of setting up.
- *
- * I_DONTCACHE Evict inode as soon as it is not used anymore.
- *
- * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists.
- * Used to detect that mark_inode_dirty() should not move
- * inode between dirty lists.
- *
- * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback.
- *
- * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding
- * i_count.
- *
- * Q: What is the difference between I_WILL_FREE and I_FREEING?
- *
- * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
- * upon. There's one free address left.
- */
-#define __I_NEW 0
-#define I_NEW (1 << __I_NEW)
-#define __I_SYNC 1
-#define I_SYNC (1 << __I_SYNC)
-#define __I_LRU_ISOLATING 2
-#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING)
-
-#define I_DIRTY_SYNC (1 << 3)
-#define I_DIRTY_DATASYNC (1 << 4)
-#define I_DIRTY_PAGES (1 << 5)
-#define I_WILL_FREE (1 << 6)
-#define I_FREEING (1 << 7)
-#define I_CLEAR (1 << 8)
-#define I_REFERENCED (1 << 9)
-#define I_LINKABLE (1 << 10)
-#define I_DIRTY_TIME (1 << 11)
-#define I_WB_SWITCH (1 << 12)
-#define I_OVL_INUSE (1 << 13)
-#define I_CREATING (1 << 14)
-#define I_DONTCACHE (1 << 15)
-#define I_SYNC_QUEUED (1 << 16)
-#define I_PINNING_NETFS_WB (1 << 17)
-
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
-#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
-#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
-
extern void __mark_inode_dirty(struct inode *, int);
static inline void mark_inode_dirty(struct inode *inode)
{
@@ -2614,6 +2611,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
__mark_inode_dirty(inode, I_DIRTY_SYNC);
}
+static inline int icount_read(const struct inode *inode)
+{
+ return atomic_read(&inode->i_count);
+}
+
/*
* Returns true if the given inode itself only has dirty timestamps (its pages
* may still be dirty) and isn't currently being allocated or freed.
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 10dd161690a2..516aba5b858b 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -61,6 +61,12 @@ struct fscrypt_name {
/* Crypto operations for filesystems */
struct fscrypt_operations {
+ /*
+ * The offset of the pointer to struct fscrypt_inode_info in the
+ * filesystem-specific part of the inode, relative to the beginning of
+ * the common part of the inode (the 'struct inode').
+ */
+ ptrdiff_t inode_info_offs;
/*
* If set, then fs/crypto/ will allocate a global bounce page pool the
@@ -195,16 +201,44 @@ struct fscrypt_operations {
int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name,
struct dentry *dentry, unsigned int flags);
+/*
+ * Returns the address of the fscrypt info pointer within the
+ * filesystem-specific part of the inode. (To save memory on filesystems that
+ * don't support fscrypt, a field in 'struct inode' itself is no longer used.)
+ */
+static inline struct fscrypt_inode_info **
+fscrypt_inode_info_addr(const struct inode *inode)
+{
+ VFS_WARN_ON_ONCE(inode->i_sb->s_cop->inode_info_offs == 0);
+ return (void *)inode + inode->i_sb->s_cop->inode_info_offs;
+}
+
+/*
+ * Load the inode's fscrypt info pointer, using a raw dereference. Since this
+ * uses a raw dereference with no memory barrier, it is appropriate to use only
+ * when the caller knows the inode's key setup already happened, resulting in
+ * non-NULL fscrypt info. E.g., the file contents en/decryption functions use
+ * this, since fscrypt_file_open() set up the key.
+ */
+static inline struct fscrypt_inode_info *
+fscrypt_get_inode_info_raw(const struct inode *inode)
+{
+ struct fscrypt_inode_info *ci = *fscrypt_inode_info_addr(inode);
+
+ VFS_WARN_ON_ONCE(ci == NULL);
+ return ci;
+}
+
static inline struct fscrypt_inode_info *
fscrypt_get_inode_info(const struct inode *inode)
{
/*
* Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info().
- * I.e., another task may publish ->i_crypt_info concurrently, executing
- * a RELEASE barrier. We need to use smp_load_acquire() here to safely
+ * I.e., another task may publish the fscrypt info concurrently,
+ * executing a RELEASE barrier. Use smp_load_acquire() here to safely
* ACQUIRE the memory the other task published.
*/
- return smp_load_acquire(&inode->i_crypt_info);
+ return smp_load_acquire(fscrypt_inode_info_addr(inode));
}
/**
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 1eb7eae580be..5bc7280425a7 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -26,8 +26,16 @@
/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
+struct fsverity_info;
+
/* Verity operations for filesystems */
struct fsverity_operations {
+ /**
+ * The offset of the pointer to struct fsverity_info in the
+ * filesystem-specific part of the inode, relative to the beginning of
+ * the common part of the inode (the 'struct inode').
+ */
+ ptrdiff_t inode_info_offs;
/**
* Begin enabling verity on the given file.
@@ -124,15 +132,37 @@ struct fsverity_operations {
#ifdef CONFIG_FS_VERITY
+/*
+ * Returns the address of the verity info pointer within the filesystem-specific
+ * part of the inode. (To save memory on filesystems that don't support
+ * fsverity, a field in 'struct inode' itself is no longer used.)
+ */
+static inline struct fsverity_info **
+fsverity_info_addr(const struct inode *inode)
+{
+ VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0);
+ return (void *)inode + inode->i_sb->s_vop->inode_info_offs;
+}
+
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
/*
- * Pairs with the cmpxchg_release() in fsverity_set_info().
- * I.e., another task may publish ->i_verity_info concurrently,
- * executing a RELEASE barrier. We need to use smp_load_acquire() here
- * to safely ACQUIRE the memory the other task published.
+ * Since this function can be called on inodes belonging to filesystems
+ * that don't support fsverity at all, and fsverity_info_addr() doesn't
+ * work on such filesystems, we have to start with an IS_VERITY() check.
+ * Checking IS_VERITY() here is also useful to minimize the overhead of
+ * fsverity_active() on non-verity files.
+ */
+ if (!IS_VERITY(inode))
+ return NULL;
+
+ /*
+ * Pairs with the cmpxchg_release() in fsverity_set_info(). I.e.,
+ * another task may publish the inode's verity info concurrently,
+ * executing a RELEASE barrier. Use smp_load_acquire() here to safely
+ * ACQUIRE the memory the other task published.
*/
- return smp_load_acquire(&inode->i_verity_info);
+ return smp_load_acquire(fsverity_info_addr(inode));
}
/* enable.c */
@@ -156,12 +186,19 @@ void __fsverity_cleanup_inode(struct inode *inode);
* fsverity_cleanup_inode() - free the inode's verity info, if present
* @inode: an inode being evicted
*
- * Filesystems must call this on inode eviction to free ->i_verity_info.
+ * Filesystems must call this on inode eviction to free the inode's verity info.
*/
static inline void fsverity_cleanup_inode(struct inode *inode)
{
- if (inode->i_verity_info)
+ /*
+ * Only IS_VERITY() inodes can have verity info, so start by checking
+ * for IS_VERITY() (which is faster than retrieving the pointer to the
+ * verity info). This minimizes overhead for non-verity inodes.
+ */
+ if (IS_VERITY(inode))
__fsverity_cleanup_inode(inode);
+ else
+ VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL);
}
/* read_metadata.c */
@@ -267,12 +304,12 @@ static inline bool fsverity_verify_page(struct page *page)
* fsverity_active() - do reads from the inode need to go through fs-verity?
* @inode: inode to check
*
- * This checks whether ->i_verity_info has been set.
+ * This checks whether the inode's verity info has been set.
*
* Filesystems call this from ->readahead() to check whether the pages need to
* be verified or not. Don't use IS_VERITY() for this purpose; it's subject to
* a race condition where the file is being read concurrently with
- * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.)
+ * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before the verity info.)
*
* Return: true if reads need to go through fs-verity, otherwise false
*/
@@ -287,7 +324,7 @@ static inline bool fsverity_active(const struct inode *inode)
* @filp: the struct file being set up
*
* When opening a verity file, deny the open if it is for writing. Otherwise,
- * set up the inode's ->i_verity_info if not already done.
+ * set up the inode's verity info if not already done.
*
* When combined with fscrypt, this must be called after fscrypt_file_open().
* Otherwise, we won't have the key set up to decrypt the verity metadata.