summaryrefslogtreecommitdiff
path: root/include/linux/fs.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/fs.h')
-rw-r--r--include/linux/fs.h746
1 files changed, 506 insertions, 240 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c3d86532e3f..a6de8d93838d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -43,6 +43,8 @@
#include <linux/cred.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
+#include <linux/maple_tree.h>
+#include <linux/rw_hint.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -67,10 +69,12 @@ struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
-struct fscrypt_info;
+struct fscrypt_inode_info;
struct fscrypt_operations;
struct fsverity_info;
struct fsverity_operations;
+struct fsnotify_mark_connector;
+struct fsnotify_sb_info;
struct fs_context;
struct fs_parameter_spec;
struct fileattr;
@@ -108,21 +112,28 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
*/
/* file is open for reading */
-#define FMODE_READ ((__force fmode_t)0x1)
+#define FMODE_READ ((__force fmode_t)(1 << 0))
/* file is open for writing */
-#define FMODE_WRITE ((__force fmode_t)0x2)
+#define FMODE_WRITE ((__force fmode_t)(1 << 1))
/* file is seekable */
-#define FMODE_LSEEK ((__force fmode_t)0x4)
+#define FMODE_LSEEK ((__force fmode_t)(1 << 2))
/* file can be accessed using pread */
-#define FMODE_PREAD ((__force fmode_t)0x8)
+#define FMODE_PREAD ((__force fmode_t)(1 << 3))
/* file can be accessed using pwrite */
-#define FMODE_PWRITE ((__force fmode_t)0x10)
+#define FMODE_PWRITE ((__force fmode_t)(1 << 4))
/* File is opened for execution with sys_execve / sys_uselib */
-#define FMODE_EXEC ((__force fmode_t)0x20)
+#define FMODE_EXEC ((__force fmode_t)(1 << 5))
+/* File writes are restricted (block device specific) */
+#define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6))
+/* File supports atomic writes */
+#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7))
+
+/* FMODE_* bit 8 */
+
/* 32bit hashes as llseek() offset (for directories) */
-#define FMODE_32BITHASH ((__force fmode_t)0x200)
+#define FMODE_32BITHASH ((__force fmode_t)(1 << 9))
/* 64bit hashes as llseek() offset (for directories) */
-#define FMODE_64BITHASH ((__force fmode_t)0x400)
+#define FMODE_64BITHASH ((__force fmode_t)(1 << 10))
/*
* Don't update ctime and mtime.
@@ -130,60 +141,52 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
* Currently a special hack for the XFS open_by_handle ioctl, but we'll
* hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
*/
-#define FMODE_NOCMTIME ((__force fmode_t)0x800)
+#define FMODE_NOCMTIME ((__force fmode_t)(1 << 11))
/* Expect random access pattern */
-#define FMODE_RANDOM ((__force fmode_t)0x1000)
+#define FMODE_RANDOM ((__force fmode_t)(1 << 12))
-/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
-#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
+/* FMODE_* bit 13 */
/* File is opened with O_PATH; almost nothing can be done with it */
-#define FMODE_PATH ((__force fmode_t)0x4000)
+#define FMODE_PATH ((__force fmode_t)(1 << 14))
/* File needs atomic accesses to f_pos */
-#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000)
+#define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15))
/* Write access to underlying fs */
-#define FMODE_WRITER ((__force fmode_t)0x10000)
+#define FMODE_WRITER ((__force fmode_t)(1 << 16))
/* Has read method(s) */
-#define FMODE_CAN_READ ((__force fmode_t)0x20000)
+#define FMODE_CAN_READ ((__force fmode_t)(1 << 17))
/* Has write method(s) */
-#define FMODE_CAN_WRITE ((__force fmode_t)0x40000)
+#define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18))
-#define FMODE_OPENED ((__force fmode_t)0x80000)
-#define FMODE_CREATED ((__force fmode_t)0x100000)
+#define FMODE_OPENED ((__force fmode_t)(1 << 19))
+#define FMODE_CREATED ((__force fmode_t)(1 << 20))
/* File is stream-like */
-#define FMODE_STREAM ((__force fmode_t)0x200000)
+#define FMODE_STREAM ((__force fmode_t)(1 << 21))
/* File supports DIRECT IO */
-#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000)
+#define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22))
-#define FMODE_NOREUSE ((__force fmode_t)0x800000)
+#define FMODE_NOREUSE ((__force fmode_t)(1 << 23))
-/* File supports non-exclusive O_DIRECT writes from multiple threads */
-#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000)
+/* FMODE_* bit 24 */
/* File is embedded in backing_file object */
-#define FMODE_BACKING ((__force fmode_t)0x2000000)
+#define FMODE_BACKING ((__force fmode_t)(1 << 25))
/* File was opened by fanotify and shouldn't generate fanotify events */
-#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
+#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26))
/* File is capable of returning -EAGAIN if I/O will block */
-#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
+#define FMODE_NOWAIT ((__force fmode_t)(1 << 27))
/* File represents mount that needs unmounting */
-#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000)
+#define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28))
/* File does not contribute to nr_files count */
-#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
-
-/* File supports async buffered reads */
-#define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000)
-
-/* File supports async nowait buffered writes */
-#define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000)
+#define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29))
/*
* Attribute flags. These should be or-ed together to figure out what
@@ -206,6 +209,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */
#define ATTR_TIMES_SET (1 << 16)
#define ATTR_TOUCH (1 << 17)
+#define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */
/*
* Whiteout is represented by a char device. The following constants define the
@@ -309,25 +313,13 @@ struct address_space;
struct writeback_control;
struct readahead_control;
-/*
- * Write life time hint values.
- * Stored in struct inode as u8.
- */
-enum rw_hint {
- WRITE_LIFE_NOT_SET = 0,
- WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
- WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
- WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
- WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
- WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
-};
-
/* Match RWF_* bits to IOCB bits */
#define IOCB_HIPRI (__force int) RWF_HIPRI
#define IOCB_DSYNC (__force int) RWF_DSYNC
#define IOCB_SYNC (__force int) RWF_SYNC
#define IOCB_NOWAIT (__force int) RWF_NOWAIT
#define IOCB_APPEND (__force int) RWF_APPEND
+#define IOCB_ATOMIC (__force int) RWF_ATOMIC
/* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD (1 << 16)
@@ -362,6 +354,7 @@ enum rw_hint {
{ IOCB_SYNC, "SYNC" }, \
{ IOCB_NOWAIT, "NOWAIT" }, \
{ IOCB_APPEND, "APPEND" }, \
+ { IOCB_ATOMIC, "ATOMIC"}, \
{ IOCB_EVENTFD, "EVENTFD"}, \
{ IOCB_DIRECT, "DIRECT" }, \
{ IOCB_WRITE, "WRITE" }, \
@@ -415,10 +408,10 @@ struct address_space_operations {
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata);
+ struct folio **foliop, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata);
+ struct folio *folio, void *fsdata);
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
@@ -436,7 +429,7 @@ struct address_space_operations {
bool (*is_partially_uptodate) (struct folio *, size_t from,
size_t count);
void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
- int (*error_remove_page)(struct address_space *, struct page *);
+ int (*error_remove_folio)(struct address_space *, struct folio *);
/* swapfile support */
int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
@@ -456,7 +449,7 @@ extern const struct address_space_operations empty_aops;
* It is also used to block modification of page cache contents through
* memory mappings.
* @gfp_mask: Memory allocation flags to use for allocating pages.
- * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
* @nr_thps: Number of THPs in the pagecache (non-shmem only).
* @i_mmap: Tree of private and shared mappings.
* @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
@@ -465,9 +458,9 @@ extern const struct address_space_operations empty_aops;
* @a_ops: Methods.
* @flags: Error bits and flags (AS_*).
* @wb_err: The most recent error which has occurred.
- * @private_lock: For use by the owner of the address_space.
- * @private_list: For use by the owner of the address_space.
- * @private_data: For use by the owner of the address_space.
+ * @i_private_lock: For use by the owner of the address_space.
+ * @i_private_list: For use by the owner of the address_space.
+ * @i_private_data: For use by the owner of the address_space.
*/
struct address_space {
struct inode *host;
@@ -484,11 +477,11 @@ struct address_space {
pgoff_t writeback_index;
const struct address_space_operations *a_ops;
unsigned long flags;
- struct rw_semaphore i_mmap_rwsem;
errseq_t wb_err;
- spinlock_t private_lock;
- struct list_head private_list;
- void *private_data;
+ spinlock_t i_private_lock;
+ struct list_head i_private_list;
+ struct rw_semaphore i_mmap_rwsem;
+ void * i_private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
/*
* On most architectures that alignment is already the case; but
@@ -559,7 +552,7 @@ static inline int mapping_mapped(struct address_space *mapping)
/*
* Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
+ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
* marks vma as VM_SHARED if it is shared, and the file was opened for
* writing i.e. vma may be mprotected writable even if now readonly.
*
@@ -631,8 +624,6 @@ is_uncached_acl(struct posix_acl *acl)
#define IOP_XATTR 0x0008
#define IOP_DEFAULT_READLINK 0x0010
-struct fsnotify_mark_connector;
-
/*
* Keep mostly read-only and often accessed (especially for
* the RCU path lookup and 'stat' data) fields at the beginning
@@ -642,7 +633,6 @@ struct inode {
umode_t i_mode;
unsigned short i_opflags;
kuid_t i_uid;
- struct list_head i_lru; /* inode LRU list */
kgid_t i_gid;
unsigned int i_flags;
@@ -674,13 +664,17 @@ struct inode {
};
dev_t i_rdev;
loff_t i_size;
- struct timespec64 i_atime;
- struct timespec64 i_mtime;
- struct timespec64 __i_ctime; /* use inode_*_ctime accessors! */
+ time64_t i_atime_sec;
+ time64_t i_mtime_sec;
+ time64_t i_ctime_sec;
+ u32 i_atime_nsec;
+ u32 i_mtime_nsec;
+ u32 i_ctime_nsec;
+ u32 i_generation;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
u8 i_blkbits;
- u8 i_write_hint;
+ enum rw_hint i_write_hint;
blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED
@@ -688,7 +682,8 @@ struct inode {
#endif
/* Misc */
- unsigned long i_state;
+ u32 i_state;
+ /* 32-bit hole */
struct rw_semaphore i_rwsem;
unsigned long dirtied_when; /* jiffies of first dirtying */
@@ -704,6 +699,7 @@ struct inode {
u16 i_wb_frn_avg_time;
u16 i_wb_frn_history;
#endif
+ struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
struct list_head i_wb_list; /* backing dev writeback list */
union {
@@ -732,15 +728,15 @@ struct inode {
unsigned i_dir_seq;
};
- __u32 i_generation;
#ifdef CONFIG_FSNOTIFY
__u32 i_fsnotify_mask; /* all events this inode cares about */
+ /* 32-bit hole reserved for expanding i_fsnotify_mask */
struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
#endif
#ifdef CONFIG_FS_ENCRYPTION
- struct fscrypt_info *i_crypt_info;
+ struct fscrypt_inode_info *i_crypt_info;
#endif
#ifdef CONFIG_FS_VERITY
@@ -750,6 +746,21 @@ struct inode {
void *i_private; /* fs or device private pointer */
} __randomize_layout;
+/*
+ * Get bit address from inode->i_state to use with wait_var_event()
+ * infrastructre.
+ */
+#define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit))
+
+struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
+ struct inode *inode, u32 bit);
+
+static inline void inode_wake_up_bit(struct inode *inode, u32 bit)
+{
+ /* Caller is responsible for correct memory barriers. */
+ wake_up_var(inode_state_wait_address(inode, bit));
+}
+
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);
static inline unsigned int i_blocksize(const struct inode *node)
@@ -909,7 +920,8 @@ static inline loff_t i_size_read(const struct inode *inode)
preempt_enable();
return i_size;
#else
- return inode->i_size;
+ /* Pairs with smp_store_release() in i_size_write() */
+ return smp_load_acquire(&inode->i_size);
#endif
}
@@ -931,7 +943,12 @@ static inline void i_size_write(struct inode *inode, loff_t i_size)
inode->i_size = i_size;
preempt_enable();
#else
- inode->i_size = i_size;
+ /*
+ * Pairs with smp_load_acquire() in i_size_read() to ensure
+ * changes related to inode size (such as page contents) are
+ * visible before we see the changed inode size.
+ */
+ smp_store_release(&inode->i_size, i_size);
#endif
}
@@ -946,6 +963,7 @@ static inline unsigned imajor(const struct inode *inode)
}
struct fown_struct {
+ struct file *file; /* backpointer for security modules */
rwlock_t lock; /* protects pid, uid, euid fields */
struct pid *pid; /* pid or -pgrp where SIGIO should be sent */
enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
@@ -985,50 +1003,69 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
index < ra->start + ra->size);
}
-/*
- * f_{lock,count,pos_lock} members can be highly contended and share
- * the same cacheline. f_{lock,mode} are very frequently used together
- * and so share the same cacheline as well. The read-mostly
- * f_{path,inode,op} are kept on a separate cacheline.
+/**
+ * struct file - Represents a file
+ * @f_count: reference count
+ * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
+ * @f_mode: FMODE_* flags often used in hotpaths
+ * @f_op: file operations
+ * @f_mapping: Contents of a cacheable, mappable object.
+ * @private_data: filesystem or driver specific data
+ * @f_inode: cached inode
+ * @f_flags: file flags
+ * @f_iocb_flags: iocb flags
+ * @f_cred: stashed credentials of creator/opener
+ * @f_path: path of the file
+ * @f_pos_lock: lock protecting file position
+ * @f_pipe: specific to pipes
+ * @f_pos: file position
+ * @f_security: LSM security context of this file
+ * @f_owner: file owner
+ * @f_wb_err: writeback error
+ * @f_sb_err: per sb writeback errors
+ * @f_ep: link of all epoll hooks for this file
+ * @f_task_work: task work entry point
+ * @f_llist: work queue entrypoint
+ * @f_ra: file's readahead state
+ * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
*/
struct file {
+ atomic_long_t f_count;
+ spinlock_t f_lock;
+ fmode_t f_mode;
+ const struct file_operations *f_op;
+ struct address_space *f_mapping;
+ void *private_data;
+ struct inode *f_inode;
+ unsigned int f_flags;
+ unsigned int f_iocb_flags;
+ const struct cred *f_cred;
+ /* --- cacheline 1 boundary (64 bytes) --- */
+ struct path f_path;
union {
- struct llist_node f_llist;
- struct rcu_head f_rcuhead;
- unsigned int f_iocb_flags;
+ /* regular files (with FMODE_ATOMIC_POS) and directories */
+ struct mutex f_pos_lock;
+ /* pipes */
+ u64 f_pipe;
};
-
- /*
- * Protects f_ep, f_flags.
- * Must not be taken from IRQ context.
- */
- spinlock_t f_lock;
- fmode_t f_mode;
- atomic_long_t f_count;
- struct mutex f_pos_lock;
- loff_t f_pos;
- unsigned int f_flags;
- struct fown_struct f_owner;
- const struct cred *f_cred;
- struct file_ra_state f_ra;
- struct path f_path;
- struct inode *f_inode; /* cached value */
- const struct file_operations *f_op;
-
- u64 f_version;
+ loff_t f_pos;
#ifdef CONFIG_SECURITY
- void *f_security;
+ void *f_security;
#endif
- /* needed for tty driver, and maybe others */
- void *private_data;
-
+ /* --- cacheline 2 boundary (128 bytes) --- */
+ struct fown_struct *f_owner;
+ errseq_t f_wb_err;
+ errseq_t f_sb_err;
#ifdef CONFIG_EPOLL
- /* Used by fs/eventpoll.c to link all the hooks to this file */
- struct hlist_head *f_ep;
-#endif /* #ifdef CONFIG_EPOLL */
- struct address_space *f_mapping;
- errseq_t f_wb_err;
- errseq_t f_sb_err; /* for syncfs */
+ struct hlist_head *f_ep;
+#endif
+ union {
+ struct callback_head f_task_work;
+ struct llist_node f_llist;
+ struct file_ra_state f_ra;
+ freeptr_t f_freeptr;
+ };
+ /* --- cacheline 3 boundary (192 bytes) --- */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
@@ -1041,10 +1078,14 @@ struct file_handle {
static inline struct file *get_file(struct file *f)
{
- atomic_long_inc(&f->f_count);
+ long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
+ WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
return f;
}
-#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
+
+struct file *get_file_rcu(struct file __rcu **f);
+struct file *get_file_active(struct file **f);
+
#define file_count(x) atomic_long_read(&(x)->f_count)
#define MAX_NON_LFS ((1UL<<31) - 1)
@@ -1061,6 +1102,7 @@ static inline struct file *get_file(struct file *f)
typedef void *fl_owner_t;
struct file_lock;
+struct file_lease;
/* The following constant reflects the upper bound of the file/locking space */
#ifndef OFFSET_MAX
@@ -1068,6 +1110,12 @@ struct file_lock;
#define OFFT_OFFSET_MAX type_max(off_t)
#endif
+int file_f_owner_allocate(struct file *file);
+static inline struct fown_struct *file_f_owner(const struct file *file)
+{
+ return READ_ONCE(file->f_owner);
+}
+
extern void send_sigio(struct fown_struct *fown, int fd, int band);
static inline struct inode *file_inode(const struct file *f)
@@ -1075,9 +1123,20 @@ static inline struct inode *file_inode(const struct file *f)
return f->f_inode;
}
+/*
+ * file_dentry() is a relic from the days that overlayfs was using files with a
+ * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs.
+ * In those days, file_dentry() was needed to get the underlying fs dentry that
+ * matches f_inode.
+ * Files with "fake" path should not exist nowadays, so use an assertion to make
+ * sure that file_dentry() was not papering over filesystem bugs.
+ */
static inline struct dentry *file_dentry(const struct file *file)
{
- return d_real(file->f_path.dentry, file_inode(file));
+ struct dentry *dentry = file->f_path.dentry;
+
+ WARN_ON_ONCE(d_inode(dentry) != file_inode(file));
+ return dentry;
}
struct fasync_struct {
@@ -1105,7 +1164,7 @@ extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force
extern int f_setown(struct file *filp, int who, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
-extern int send_sigurg(struct fown_struct *fown);
+extern int send_sigurg(struct file *file);
/*
* sb->s_flags. Note that these mirror the equivalent MS_* flags where
@@ -1121,7 +1180,7 @@ extern int send_sigurg(struct fown_struct *fown);
#define SB_NOATIME BIT(10) /* Do not update access times. */
#define SB_NODIRATIME BIT(11) /* Do not update directory access times */
#define SB_SILENT BIT(15)
-#define SB_POSIXACL BIT(16) /* VFS does not apply the umask */
+#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */
#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */
#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */
#define SB_I_VERSION BIT(23) /* Update inode I_version field */
@@ -1163,11 +1222,14 @@ extern int send_sigurg(struct fown_struct *fown);
#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020
#define SB_I_UNTRUSTED_MOUNTER 0x00000040
+#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080
#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */
#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */
#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */
+#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */
+#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */
/* Possible states of 'frozen' field */
enum {
@@ -1183,7 +1245,8 @@ enum {
struct sb_writers {
unsigned short frozen; /* Is sb frozen? */
- unsigned short freeze_holders; /* Who froze fs? */
+ int freeze_kcount; /* How many kernel freeze requests? */
+ int freeze_ucount; /* How many userspace freeze requests? */
struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};
@@ -1208,7 +1271,7 @@ struct super_block {
#ifdef CONFIG_SECURITY
void *s_security;
#endif
- const struct xattr_handler **s_xattr;
+ const struct xattr_handler * const *s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
const struct fscrypt_operations *s_cop;
struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */
@@ -1222,8 +1285,8 @@ struct super_block {
#endif
struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
- struct block_device *s_bdev;
- struct bdev_handle *s_bdev_handle;
+ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */
+ struct file *s_bdev_file;
struct backing_dev_info *s_bdi;
struct mtd_info *s_mtd;
struct hlist_node s_instances;
@@ -1234,7 +1297,7 @@ struct super_block {
/*
* Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
- * s_fsnotify_marks together for cache efficiency. They are frequently
+ * s_fsnotify_info together for cache efficiency. They are frequently
* accessed and rarely modified.
*/
void *s_fs_info; /* Filesystem private info */
@@ -1245,12 +1308,26 @@ struct super_block {
time64_t s_time_min;
time64_t s_time_max;
#ifdef CONFIG_FSNOTIFY
- __u32 s_fsnotify_mask;
- struct fsnotify_mark_connector __rcu *s_fsnotify_marks;
+ u32 s_fsnotify_mask;
+ struct fsnotify_sb_info *s_fsnotify_info;
#endif
+ /*
+ * q: why are s_id and s_sysfs_name not the same? both are human
+ * readable strings that identify the filesystem
+ * a: s_id is allowed to change at runtime; it's used in log messages,
+ * and we want to when a device starts out as single device (s_id is dev
+ * name) but then a device is hot added and we have to switch to
+ * identifying it by UUID
+ * but s_sysfs_name is a handle for programmatic access, and can't
+ * change at runtime
+ */
char s_id[32]; /* Informational name */
uuid_t s_uuid; /* UUID */
+ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */
+
+ /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
+ char s_sysfs_name[UUID_STRING_LEN + 1];
unsigned int s_max_links;
@@ -1268,17 +1345,11 @@ struct super_block {
const struct dentry_operations *s_d_op; /* default d_op for dentries */
- struct shrinker s_shrink; /* per-sb shrinker handle */
+ struct shrinker *s_shrink; /* per-sb shrinker handle */
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
- /*
- * Number of inode/mount/sb objects that are being watched, note that
- * inodes objects are currently double-accounted.
- */
- atomic_long_t s_fsnotify_connectors;
-
/* Read-only state of the superblock is being changed */
int s_readonly_remount;
@@ -1516,23 +1587,27 @@ struct timespec64 inode_set_ctime_current(struct inode *inode);
static inline time64_t inode_get_atime_sec(const struct inode *inode)
{
- return inode->i_atime.tv_sec;
+ return inode->i_atime_sec;
}
static inline long inode_get_atime_nsec(const struct inode *inode)
{
- return inode->i_atime.tv_nsec;
+ return inode->i_atime_nsec;
}
static inline struct timespec64 inode_get_atime(const struct inode *inode)
{
- return inode->i_atime;
+ struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode),
+ .tv_nsec = inode_get_atime_nsec(inode) };
+
+ return ts;
}
static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
struct timespec64 ts)
{
- inode->i_atime = ts;
+ inode->i_atime_sec = ts.tv_sec;
+ inode->i_atime_nsec = ts.tv_nsec;
return ts;
}
@@ -1541,28 +1616,32 @@ static inline struct timespec64 inode_set_atime(struct inode *inode,
{
struct timespec64 ts = { .tv_sec = sec,
.tv_nsec = nsec };
+
return inode_set_atime_to_ts(inode, ts);
}
static inline time64_t inode_get_mtime_sec(const struct inode *inode)
{
- return inode->i_mtime.tv_sec;
+ return inode->i_mtime_sec;
}
static inline long inode_get_mtime_nsec(const struct inode *inode)
{
- return inode->i_mtime.tv_nsec;
+ return inode->i_mtime_nsec;
}
static inline struct timespec64 inode_get_mtime(const struct inode *inode)
{
- return inode->i_mtime;
+ struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode),
+ .tv_nsec = inode_get_mtime_nsec(inode) };
+ return ts;
}
static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
struct timespec64 ts)
{
- inode->i_mtime = ts;
+ inode->i_mtime_sec = ts.tv_sec;
+ inode->i_mtime_nsec = ts.tv_nsec;
return ts;
}
@@ -1576,23 +1655,27 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode,
static inline time64_t inode_get_ctime_sec(const struct inode *inode)
{
- return inode->__i_ctime.tv_sec;
+ return inode->i_ctime_sec;
}
static inline long inode_get_ctime_nsec(const struct inode *inode)
{
- return inode->__i_ctime.tv_nsec;
+ return inode->i_ctime_nsec;
}
static inline struct timespec64 inode_get_ctime(const struct inode *inode)
{
- return inode->__i_ctime;
+ struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode),
+ .tv_nsec = inode_get_ctime_nsec(inode) };
+
+ return ts;
}
static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
struct timespec64 ts)
{
- inode->__i_ctime = ts;
+ inode->i_ctime_sec = ts.tv_sec;
+ inode->i_ctime_nsec = ts.tv_nsec;
return ts;
}
@@ -1641,11 +1724,72 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
#define __sb_writers_acquired(sb, lev) \
percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_writers_release(sb, lev) \
- percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
+ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_)
+
+/**
+ * __sb_write_started - check if sb freeze level is held
+ * @sb: the super we write to
+ * @level: the freeze level
+ *
+ * * > 0 - sb freeze level is held
+ * * 0 - sb freeze level is not held
+ * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
+ */
+static inline int __sb_write_started(const struct super_block *sb, int level)
+{
+ return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
+}
+/**
+ * sb_write_started - check if SB_FREEZE_WRITE is held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
static inline bool sb_write_started(const struct super_block *sb)
{
- return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1);
+ return __sb_write_started(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_write_not_started - check if SB_FREEZE_WRITE is not held
+ * @sb: the super we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ */
+static inline bool sb_write_not_started(const struct super_block *sb)
+{
+ return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
+}
+
+/**
+ * file_write_started - check if SB_FREEZE_WRITE is held
+ * @file: the file we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ * May be false positive with !S_ISREG, because file_start_write() has
+ * no effect on !S_ISREG.
+ */
+static inline bool file_write_started(const struct file *file)
+{
+ if (!S_ISREG(file_inode(file)->i_mode))
+ return true;
+ return sb_write_started(file_inode(file)->i_sb);
+}
+
+/**
+ * file_write_not_started - check if SB_FREEZE_WRITE is not held
+ * @file: the file we write to
+ *
+ * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
+ * May be false positive with !S_ISREG, because file_start_write() has
+ * no effect on !S_ISREG.
+ */
+static inline bool file_write_not_started(const struct file *file)
+{
+ if (!S_ISREG(file_inode(file)->i_mode))
+ return true;
+ return sb_write_not_started(file_inode(file)->i_sb);
}
/**
@@ -1816,7 +1960,7 @@ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
umode_t mode, int open_flag,
const struct cred *cred);
struct file *kernel_file_open(const struct path *path, int flags,
- struct inode *inode, const struct cred *cred);
+ const struct cred *cred);
int vfs_mkobj(struct dentry *, umode_t,
int (*f)(struct dentry *, umode_t, void *),
@@ -1843,6 +1987,8 @@ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
extern bool may_open_dev(const struct path *path);
umode_t mode_strip_sgid(struct mnt_idmap *idmap,
const struct inode *dir, umode_t mode);
+bool in_group_or_capable(struct mnt_idmap *idmap,
+ const struct inode *inode, vfsgid_t vfsgid);
/*
* This is the "filldir" function type, used by readdir() to let
@@ -1911,8 +2057,11 @@ struct iov_iter;
struct io_uring_cmd;
struct offset_ctx;
+typedef unsigned int __bitwise fop_flags_t;
+
struct file_operations {
struct module *owner;
+ fop_flags_t fop_flags;
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
@@ -1925,7 +2074,6 @@ struct file_operations {
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
- unsigned long mmap_supported_flags;
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
@@ -1938,7 +2086,7 @@ struct file_operations {
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
void (*splice_eof)(struct file *file);
- int (*setlease)(struct file *, int, struct file_lock **, void **);
+ int (*setlease)(struct file *, int, struct file_lease **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
@@ -1956,6 +2104,19 @@ struct file_operations {
unsigned int poll_flags);
} __randomize_layout;
+/* Supports async buffered reads */
+#define FOP_BUFFER_RASYNC ((__force fop_flags_t)(1 << 0))
+/* Supports async buffered writes */
+#define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1))
+/* Supports synchronous page faults for mappings */
+#define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2))
+/* Supports non-exclusive O_DIRECT writes from multiple threads */
+#define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3))
+/* Contains huge pages */
+#define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
+/* Treat loff_t as unsigned (e.g., /dev/mem) */
+#define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5))
+
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
int (*) (struct file *, struct dir_context *));
@@ -2006,18 +2167,6 @@ struct inode_operations {
struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
} ____cacheline_aligned;
-static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
- struct iov_iter *iter)
-{
- return file->f_op->read_iter(kio, iter);
-}
-
-static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
- struct iov_iter *iter)
-{
- return file->f_op->write_iter(kio, iter);
-}
-
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
return file->f_op->mmap(file, vma);
@@ -2027,9 +2176,7 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int);
-extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags);
+int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write);
int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags,
@@ -2037,9 +2184,6 @@ int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *count, unsigned int remap_flags);
-extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags);
extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
@@ -2049,9 +2193,24 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos,
loff_t len, unsigned int remap_flags);
+/**
+ * enum freeze_holder - holder of the freeze
+ * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
+ * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
+ * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
+ *
+ * Indicate who the owner of the freeze or thaw request is and whether
+ * the freeze needs to be exclusive or can nest.
+ * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
+ * same holder aren't allowed. It is however allowed to hold a single
+ * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
+ * the same time. This is relied upon by some filesystems during online
+ * repair or similar.
+ */
enum freeze_holder {
FREEZE_HOLDER_KERNEL = (1U << 0),
FREEZE_HOLDER_USERSPACE = (1U << 1),
+ FREEZE_MAY_NEST = (1U << 2),
};
struct super_operations {
@@ -2143,11 +2302,22 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode) ((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE)
+
+#ifdef CONFIG_FS_POSIX_ACL
#define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL)
+#else
+#define IS_POSIXACL(inode) 0
+#endif
#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
+
+#ifdef CONFIG_SWAP
#define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
+#else
+#define IS_SWAPFILE(inode) ((void)(inode), 0U)
+#endif
+
#define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode) ((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT)
@@ -2245,8 +2415,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
*
* I_REFERENCED Marks the inode as recently references on the LRU list.
*
- * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit().
- *
* I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
* synchronize competing switching instances and to tell
* wb stat updates to grab the i_pages lock. See
@@ -2269,30 +2437,32 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
* i_count.
*
* Q: What is the difference between I_WILL_FREE and I_FREEING?
+ *
+ * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
+ * upon. There's one free address left.
*/
-#define I_DIRTY_SYNC (1 << 0)
-#define I_DIRTY_DATASYNC (1 << 1)
-#define I_DIRTY_PAGES (1 << 2)
-#define __I_NEW 3
+#define __I_NEW 0
#define I_NEW (1 << __I_NEW)
-#define I_WILL_FREE (1 << 4)
-#define I_FREEING (1 << 5)
-#define I_CLEAR (1 << 6)
-#define __I_SYNC 7
+#define __I_SYNC 1
#define I_SYNC (1 << __I_SYNC)
-#define I_REFERENCED (1 << 8)
-#define __I_DIO_WAKEUP 9
-#define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP)
+#define __I_LRU_ISOLATING 2
+#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING)
+
+#define I_DIRTY_SYNC (1 << 3)
+#define I_DIRTY_DATASYNC (1 << 4)
+#define I_DIRTY_PAGES (1 << 5)
+#define I_WILL_FREE (1 << 6)
+#define I_FREEING (1 << 7)
+#define I_CLEAR (1 << 8)
+#define I_REFERENCED (1 << 9)
#define I_LINKABLE (1 << 10)
#define I_DIRTY_TIME (1 << 11)
-#define I_WB_SWITCH (1 << 13)
-#define I_OVL_INUSE (1 << 14)
-#define I_CREATING (1 << 15)
-#define I_DONTCACHE (1 << 16)
-#define I_SYNC_QUEUED (1 << 17)
-#define I_PINNING_FSCACHE_WB (1 << 18)
-#define __I_LRU_ISOLATING 19
-#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING)
+#define I_WB_SWITCH (1 << 12)
+#define I_OVL_INUSE (1 << 13)
+#define I_CREATING (1 << 14)
+#define I_DONTCACHE (1 << 15)
+#define I_SYNC_QUEUED (1 << 16)
+#define I_PINNING_NETFS_WB (1 << 17)
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
@@ -2426,10 +2596,17 @@ struct super_block *sget(struct file_system_type *type,
struct super_block *sget_dev(struct fs_context *fc, dev_t dev);
/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
-#define fops_get(fops) \
- (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
-#define fops_put(fops) \
- do { if (fops) module_put((fops)->owner); } while(0)
+#define fops_get(fops) ({ \
+ const struct file_operations *_fops = (fops); \
+ (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \
+})
+
+#define fops_put(fops) ({ \
+ const struct file_operations *_fops = (fops); \
+ if (_fops) \
+ module_put((_fops)->owner); \
+})
+
/*
* This one is to be used *ONLY* from ->open() instances.
* fops must be non-NULL, pinned down *and* module dependencies
@@ -2453,6 +2630,44 @@ extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);
+static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len)
+{
+ if (WARN_ON(len > sizeof(sb->s_uuid)))
+ len = sizeof(sb->s_uuid);
+ sb->s_uuid_len = len;
+ memcpy(&sb->s_uuid, uuid, len);
+}
+
+/* set sb sysfs name based on sb->s_bdev */
+static inline void super_set_sysfs_name_bdev(struct super_block *sb)
+{
+ snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev);
+}
+
+/* set sb sysfs name based on sb->s_uuid */
+static inline void super_set_sysfs_name_uuid(struct super_block *sb)
+{
+ WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid));
+ snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b);
+}
+
+/* set sb sysfs name based on sb->s_id */
+static inline void super_set_sysfs_name_id(struct super_block *sb)
+{
+ strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name));
+}
+
+/* try to use something standard before you use this */
+__printf(2, 3)
+static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args);
+ va_end(args);
+}
+
extern int current_umask(void);
extern void ihold(struct inode * inode);
@@ -2476,7 +2691,7 @@ struct filename {
};
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
-static inline struct mnt_idmap *file_mnt_idmap(struct file *file)
+static inline struct mnt_idmap *file_mnt_idmap(const struct file *file)
{
return mnt_idmap(file->f_path.mnt);
}
@@ -2515,26 +2730,31 @@ struct file *dentry_open(const struct path *path, int flags,
const struct cred *creds);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
const struct cred *cred);
-struct file *backing_file_open(const struct path *path, int flags,
- const struct path *real_path,
- const struct cred *cred);
-struct path *backing_file_real_path(struct file *f);
+struct path *backing_file_user_path(struct file *f);
/*
- * file_real_path - get the path corresponding to f_inode
- *
- * When opening a backing file for a stackable filesystem (e.g.,
- * overlayfs) f_path may be on the stackable filesystem and f_inode on
- * the underlying filesystem. When the path associated with f_inode is
- * needed, this helper should be used instead of accessing f_path
- * directly.
-*/
-static inline const struct path *file_real_path(struct file *f)
+ * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
+ * stored in ->vm_file is a backing file whose f_inode is on the underlying
+ * filesystem. When the mapped file path and inode number are displayed to
+ * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the
+ * path and inode number to display to the user, which is the path of the fd
+ * that user has requested to map and the inode number that would be returned
+ * by fstat() on that same fd.
+ */
+/* Get the path to display in /proc/<pid>/maps */
+static inline const struct path *file_user_path(struct file *f)
{
if (unlikely(f->f_mode & FMODE_BACKING))
- return backing_file_real_path(f);
+ return backing_file_user_path(f);
return &f->f_path;
}
+/* Get the inode whose inode number to display in /proc/<pid>/maps */
+static inline const struct inode *file_user_inode(struct file *f)
+{
+ if (unlikely(f->f_mode & FMODE_BACKING))
+ return d_inode(backing_file_user_path(f)->dentry);
+ return file_inode(f);
+}
static inline struct file *file_clone_open(struct file *file)
{
@@ -2542,7 +2762,7 @@ static inline struct file *file_clone_open(struct file *file)
}
extern int filp_close(struct file *, fl_owner_t id);
-extern struct filename *getname_flags(const char __user *, int, int *);
+extern struct filename *getname_flags(const char __user *, int);
extern struct filename *getname_uflags(const char __user *, int);
extern struct filename *getname(const char __user *);
extern struct filename *getname_kernel(const char *);
@@ -2862,7 +3082,12 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
-extern int inode_init_always(struct super_block *, struct inode *);
+extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
+static inline int inode_init_always(struct super_block *sb, struct inode *inode)
+{
+ return inode_init_always_gfp(sb, inode, GFP_NOFS);
+}
+
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
@@ -2886,7 +3111,12 @@ extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *),
void *data);
-extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
+struct inode *iget5_locked(struct super_block *, unsigned long,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *);
+struct inode *iget5_locked_rcu(struct super_block *, unsigned long,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
extern struct inode *find_inode_nowait(struct super_block *,
unsigned long,
@@ -2924,7 +3154,14 @@ static inline bool is_zero_ino(ino_t ino)
return (u32)ino == 0;
}
-extern void __iget(struct inode * inode);
+/*
+ * inode->i_lock must be held
+ */
+static inline void __iget(struct inode *inode)
+{
+ atomic_inc(&inode->i_count);
+}
+
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
@@ -2932,6 +3169,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb);
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
+extern int file_remove_privs_flags(struct file *file, unsigned int flags);
extern int file_remove_privs(struct file *);
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
const struct inode *inode);
@@ -2940,11 +3178,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap,
* This must be used for allocating filesystems specific inodes to set
* up the inode reclaim context correctly.
*/
-static inline void *
-alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp)
-{
- return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp);
-}
+#define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp)
extern void __insert_inode_hash(struct inode *, unsigned long hashval);
static inline void insert_inode_hash(struct inode *inode)
@@ -3000,18 +3234,17 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
size_t len, unsigned int flags);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
struct file *, loff_t *, size_t, unsigned int);
-extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
- loff_t *opos, size_t len, unsigned int flags);
extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
-#define no_llseek NULL
extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
int whence, loff_t maxsize, loff_t eof);
+loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
+ u64 *cookie);
extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
@@ -3049,7 +3282,9 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
}
#endif
+bool inode_dio_finished(const struct inode *inode);
void inode_dio_wait(struct inode *inode);
+void inode_dio_wait_interruptible(struct inode *inode);
/**
* inode_dio_begin - signal start of a direct I/O requests
@@ -3073,7 +3308,7 @@ static inline void inode_dio_begin(struct inode *inode)
static inline void inode_dio_end(struct inode *inode)
{
if (atomic_dec_and_test(&inode->i_dio_count))
- wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
+ wake_up_var(&inode->i_dio_count);
}
extern void inode_set_flags(struct inode *inode, unsigned int flags,
@@ -3085,6 +3320,8 @@ extern const struct file_operations generic_ro_fops;
extern int readlink_copy(char __user *, int, const char *);
extern int page_readlink(struct dentry *, char __user *, int);
+extern const char *page_get_link_raw(struct dentry *, struct inode *,
+ struct delayed_call *);
extern const char *page_get_link(struct dentry *, struct inode *,
struct delayed_call *);
extern void page_put_link(void *);
@@ -3093,6 +3330,9 @@ extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
+void generic_fill_statx_atomic_writes(struct kstat *stat,
+ unsigned int unit_min,
+ unsigned int unit_max);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
void __inode_add_bytes(struct inode *inode, loff_t bytes);
@@ -3130,7 +3370,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int);
extern struct file_system_type *get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
-extern struct super_block *get_active_super(struct block_device *bdev);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
extern void iterate_supers(void (*)(struct super_block *, void *), void *);
@@ -3164,11 +3403,13 @@ extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
extern int simple_empty(struct dentry *);
extern int simple_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata);
+ struct folio **foliop, void **fsdata);
extern const struct address_space_operations ram_aops;
extern int always_delete_dentry(const struct dentry *);
extern struct inode *alloc_anon_inode(struct super_block *);
-extern int simple_nosetlease(struct file *, int, struct file_lock **, void **);
+struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name,
+ const struct inode *context_inode);
+extern int simple_nosetlease(struct file *, int, struct file_lease **, void **);
extern const struct dentry_operations simple_dentry_operations;
extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
@@ -3190,13 +3431,15 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
const void __user *from, size_t count);
struct offset_ctx {
- struct xarray xa;
- u32 next_offset;
+ struct maple_tree mt;
+ unsigned long next_offset;
};
void simple_offset_init(struct offset_ctx *octx);
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
+int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry);
int simple_offset_rename_exchange(struct inode *old_dir,
struct dentry *old_dentry,
struct inode *new_dir,
@@ -3210,7 +3453,11 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
extern int generic_check_addressable(unsigned, u64);
-extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);
+extern void generic_set_sb_d_ops(struct super_block *sb);
+extern int generic_ci_match(const struct inode *parent,
+ const struct qstr *name,
+ const struct qstr *folded_name,
+ const u8 *de_name, u32 de_name_len);
static inline bool sb_has_encoding(const struct super_block *sb)
{
@@ -3263,7 +3510,8 @@ static inline int iocb_flags(struct file *file)
return res;
}
-static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
+static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
+ int rw_type)
{
int kiocb_flags = 0;
@@ -3274,34 +3522,33 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
return 0;
if (unlikely(flags & ~RWF_SUPPORTED))
return -EOPNOTSUPP;
+ if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND)))
+ return -EINVAL;
if (flags & RWF_NOWAIT) {
if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
return -EOPNOTSUPP;
- kiocb_flags |= IOCB_NOIO;
+ }
+ if (flags & RWF_ATOMIC) {
+ if (rw_type != WRITE)
+ return -EOPNOTSUPP;
+ if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
+ return -EOPNOTSUPP;
}
kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
if (flags & RWF_SYNC)
kiocb_flags |= IOCB_DSYNC;
+ if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) {
+ if (IS_APPEND(file_inode(ki->ki_filp)))
+ return -EPERM;
+ ki->ki_flags &= ~IOCB_APPEND;
+ }
+
ki->ki_flags |= kiocb_flags;
return 0;
}
-static inline ino_t parent_ino(struct dentry *dentry)
-{
- ino_t res;
-
- /*
- * Don't strictly need d_lock here? If the parent ino could change
- * then surely we'd have a deeper race in the caller?
- */
- spin_lock(&dentry->d_lock);
- res = dentry->d_parent->d_inode->i_ino;
- spin_unlock(&dentry->d_lock);
- return res;
-}
-
/* Transaction based IO helpers */
/*
@@ -3426,7 +3673,7 @@ static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
{
return ctx->actor(ctx, "..", 2, ctx->pos,
- parent_ino(file->f_path.dentry), DT_DIR);
+ d_parent_ino(file->f_path.dentry), DT_DIR);
}
static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
{
@@ -3465,4 +3712,23 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
int advice);
+static inline bool vfs_empty_path(int dfd, const char __user *path)
+{
+ char c;
+
+ if (dfd < 0)
+ return false;
+
+ /* We now allow NULL to be used for empty path. */
+ if (!path)
+ return true;
+
+ if (unlikely(get_user(c, path)))
+ return false;
+
+ return !c;
+}
+
+int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
+
#endif /* _LINUX_FS_H */