Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.h | 2
-rw-r--r--  fs/9p/vfs_dentry.c | 26
-rw-r--r--  fs/afs/dir.c | 40
-rw-r--r--  fs/aio.c | 2
-rw-r--r--  fs/anon_inodes.c | 4
-rw-r--r--  fs/bcachefs/Kconfig | 7
-rw-r--r--  fs/bcachefs/alloc_background.c | 47
-rw-r--r--  fs/bcachefs/alloc_foreground.c | 10
-rw-r--r--  fs/bcachefs/alloc_types.h | 1
-rw-r--r--  fs/bcachefs/btree_cache.c | 5
-rw-r--r--  fs/bcachefs/btree_iter.c | 36
-rw-r--r--  fs/bcachefs/btree_iter.h | 14
-rw-r--r--  fs/bcachefs/btree_key_cache.c | 5
-rw-r--r--  fs/bcachefs/btree_trans_commit.c | 6
-rw-r--r--  fs/bcachefs/btree_types.h | 3
-rw-r--r--  fs/bcachefs/btree_update_interior.c | 2
-rw-r--r--  fs/bcachefs/btree_update_interior.h | 4
-rw-r--r--  fs/bcachefs/buckets_waiting_for_journal.c | 12
-rw-r--r--  fs/bcachefs/buckets_waiting_for_journal.h | 4
-rw-r--r--  fs/bcachefs/compress.c | 31
-rw-r--r--  fs/bcachefs/compress.h | 4
-rw-r--r--  fs/bcachefs/data_update.c | 50
-rw-r--r--  fs/bcachefs/debug.c | 1
-rw-r--r--  fs/bcachefs/disk_accounting.h | 2
-rw-r--r--  fs/bcachefs/fsck.c | 80
-rw-r--r--  fs/bcachefs/inode.h | 4
-rw-r--r--  fs/bcachefs/io_write.c | 16
-rw-r--r--  fs/bcachefs/io_write.h | 2
-rw-r--r--  fs/bcachefs/journal.c | 110
-rw-r--r--  fs/bcachefs/journal.h | 10
-rw-r--r--  fs/bcachefs/journal_io.c | 2
-rw-r--r--  fs/bcachefs/journal_reclaim.c | 153
-rw-r--r--  fs/bcachefs/journal_reclaim.h | 3
-rw-r--r--  fs/bcachefs/journal_types.h | 17
-rw-r--r--  fs/bcachefs/movinggc.c | 11
-rw-r--r--  fs/bcachefs/opts.h | 18
-rw-r--r--  fs/bcachefs/rebalance.c | 8
-rw-r--r--  fs/bcachefs/rebalance.h | 20
-rw-r--r--  fs/bcachefs/recovery.c | 1
-rw-r--r--  fs/bcachefs/reflink.c | 16
-rw-r--r--  fs/bcachefs/sb-downgrade.c | 2
-rw-r--r--  fs/bcachefs/sb-errors_format.h | 6
-rw-r--r--  fs/bcachefs/str_hash.c | 24
-rw-r--r--  fs/bcachefs/subvolume.c | 7
-rw-r--r--  fs/bcachefs/super.c | 11
-rw-r--r--  fs/bcachefs/super.h | 1
-rw-r--r--  fs/bcachefs/trace.h | 40
-rw-r--r--  fs/bcachefs/util.h | 2
-rw-r--r--  fs/btrfs/ctree.c | 2
-rw-r--r--  fs/btrfs/extent_io.c | 29
-rw-r--r--  fs/btrfs/file.c | 4
-rw-r--r--  fs/btrfs/ordered-data.c | 12
-rw-r--r--  fs/btrfs/qgroup.c | 11
-rw-r--r--  fs/btrfs/transaction.c | 4
-rw-r--r--  fs/cachefiles/error_inject.c | 2
-rw-r--r--  fs/ceph/debugfs.c | 2
-rw-r--r--  fs/ceph/dir.c | 25
-rw-r--r--  fs/ceph/mds_client.c | 41
-rw-r--r--  fs/ceph/mds_client.h | 2
-rw-r--r--  fs/coda/dir.c | 3
-rw-r--r--  fs/coda/sysctl.c | 2
-rw-r--r--  fs/coredump.c | 2
-rw-r--r--  fs/crypto/fname.c | 22
-rw-r--r--  fs/dcache.c | 107
-rw-r--r--  fs/debugfs/file.c | 20
-rw-r--r--  fs/devpts/inode.c | 2
-rw-r--r--  fs/ecryptfs/dentry.c | 18
-rw-r--r--  fs/erofs/xattr.c | 2
-rw-r--r--  fs/erofs/zdata.c | 2
-rw-r--r--  fs/eventpoll.c | 2
-rw-r--r--  fs/exec.c | 2
-rw-r--r--  fs/exfat/namei.c | 11
-rw-r--r--  fs/ext4/fast_commit.c | 29
-rw-r--r--  fs/ext4/fast_commit.h | 3
-rw-r--r--  fs/fat/namei_vfat.c | 19
-rw-r--r--  fs/file_table.c | 22
-rw-r--r--  fs/fuse/Kconfig | 12
-rw-r--r--  fs/fuse/Makefile | 1
-rw-r--r--  fs/fuse/dax.c | 11
-rw-r--r--  fs/fuse/dev.c | 127
-rw-r--r--  fs/fuse/dev_uring.c | 1319
-rw-r--r--  fs/fuse/dev_uring_i.h | 205
-rw-r--r--  fs/fuse/dir.c | 48
-rw-r--r--  fs/fuse/fuse_dev_i.h | 66
-rw-r--r--  fs/fuse/fuse_i.h | 32
-rw-r--r--  fs/fuse/inode.c | 14
-rw-r--r--  fs/fuse/sysctl.c | 2
-rw-r--r--  fs/fuse/xattr.c | 7
-rw-r--r--  fs/gfs2/dentry.c | 31
-rw-r--r--  fs/gfs2/glock.c | 2
-rw-r--r--  fs/gfs2/main.c | 1
-rw-r--r--  fs/gfs2/quota.c | 4
-rw-r--r--  fs/hfs/sysdep.c | 3
-rw-r--r--  fs/hostfs/hostfs_kern.c | 81
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/jfs/namei.c | 3
-rw-r--r--  fs/kernfs/dir.c | 3
-rw-r--r--  fs/kernfs/file.c | 2
-rw-r--r--  fs/libfs.c | 15
-rw-r--r--  fs/lockd/svc.c | 2
-rw-r--r--  fs/locks.c | 2
-rw-r--r--  fs/namei.c | 20
-rw-r--r--  fs/namespace.c | 56
-rw-r--r--  fs/netfs/buffered_read.c | 19
-rw-r--r--  fs/netfs/internal.h | 4
-rw-r--r--  fs/netfs/read_collect.c | 6
-rw-r--r--  fs/netfs/read_retry.c | 43
-rw-r--r--  fs/netfs/stats.c | 9
-rw-r--r--  fs/netfs/write_issue.c | 1
-rw-r--r--  fs/netfs/write_retry.c | 2
-rw-r--r--  fs/nfs/dir.c | 62
-rw-r--r--  fs/nfs/namespace.c | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 5
-rw-r--r--  fs/nfs/nfs4proc.c | 20
-rw-r--r--  fs/nfs/nfs4sysctl.c | 2
-rw-r--r--  fs/nfs/proc.c | 6
-rw-r--r--  fs/nfs/sysctl.c | 2
-rw-r--r--  fs/nfsd/filecache.c | 11
-rw-r--r--  fs/nfsd/nfs2acl.c | 2
-rw-r--r--  fs/nfsd/nfs3acl.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 9
-rw-r--r--  fs/nfsd/nfs4state.c | 3
-rw-r--r--  fs/nfsd/nfsfh.c | 5
-rw-r--r--  fs/nilfs2/inode.c | 6
-rw-r--r--  fs/notify/dnotify/dnotify.c | 2
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 2
-rw-r--r--  fs/notify/fsnotify.c | 18
-rw-r--r--  fs/notify/inotify/inotify_user.c | 2
-rw-r--r--  fs/ntfs3/attrib.c | 15
-rw-r--r--  fs/ntfs3/dir.c | 2
-rw-r--r--  fs/ntfs3/frecord.c | 74
-rw-r--r--  fs/ntfs3/fsntfs.c | 6
-rw-r--r--  fs/ntfs3/index.c | 6
-rw-r--r--  fs/ntfs3/inode.c | 3
-rw-r--r--  fs/ntfs3/ntfs_fs.h | 21
-rw-r--r--  fs/ntfs3/record.c | 79
-rw-r--r--  fs/ocfs2/dcache.c | 14
-rw-r--r--  fs/ocfs2/stackglue.c | 2
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/open.c | 11
-rw-r--r--  fs/orangefs/dcache.c | 22
-rw-r--r--  fs/orangefs/orangefs-debugfs.c | 4
-rw-r--r--  fs/overlayfs/namei.c | 2
-rw-r--r--  fs/overlayfs/super.c | 22
-rw-r--r--  fs/pidfs.c | 12
-rw-r--r--  fs/pipe.c | 8
-rw-r--r--  fs/proc/base.c | 6
-rw-r--r--  fs/proc/fd.c | 3
-rw-r--r--  fs/proc/generic.c | 6
-rw-r--r--  fs/proc/proc_sysctl.c | 3
-rw-r--r--  fs/proc/vmcore.c | 5
-rw-r--r--  fs/quota/dquot.c | 2
-rw-r--r--  fs/smb/client/asn1.c | 2
-rw-r--r--  fs/smb/client/cifs_spnego.c | 4
-rw-r--r--  fs/smb/client/cifsacl.c | 25
-rw-r--r--  fs/smb/client/cifsfs.c | 6
-rw-r--r--  fs/smb/client/cifsfs.h | 4
-rw-r--r--  fs/smb/client/cifsglob.h | 82
-rw-r--r--  fs/smb/client/cifspdu.h | 102
-rw-r--r--  fs/smb/client/cifsproto.h | 4
-rw-r--r--  fs/smb/client/cifssmb.c | 4
-rw-r--r--  fs/smb/client/connect.c | 4
-rw-r--r--  fs/smb/client/dfs.c | 30
-rw-r--r--  fs/smb/client/dfs.h | 7
-rw-r--r--  fs/smb/client/dfs_cache.c | 27
-rw-r--r--  fs/smb/client/dir.c | 3
-rw-r--r--  fs/smb/client/file.c | 7
-rw-r--r--  fs/smb/client/fs_context.c | 104
-rw-r--r--  fs/smb/client/fs_context.h | 21
-rw-r--r--  fs/smb/client/inode.c | 24
-rw-r--r--  fs/smb/client/link.c | 60
-rw-r--r--  fs/smb/client/netmisc.c | 10
-rw-r--r--  fs/smb/client/nterr.c | 1
-rw-r--r--  fs/smb/client/nterr.h | 1
-rw-r--r--  fs/smb/client/reparse.c | 522
-rw-r--r--  fs/smb/client/reparse.h | 30
-rw-r--r--  fs/smb/client/sess.c | 3
-rw-r--r--  fs/smb/client/smb1ops.c | 32
-rw-r--r--  fs/smb/client/smb2file.c | 52
-rw-r--r--  fs/smb/client/smb2inode.c | 9
-rw-r--r--  fs/smb/client/smb2maperror.c | 4
-rw-r--r--  fs/smb/client/smb2ops.c | 44
-rw-r--r--  fs/smb/client/smb2pdu.c | 6
-rw-r--r--  fs/smb/client/smb2proto.h | 5
-rw-r--r--  fs/smb/common/smb2pdu.h | 44
-rw-r--r--  fs/smb/common/smbfsctl.h | 3
-rw-r--r--  fs/stat.c | 4
-rw-r--r--  fs/sysctls.c | 2
-rw-r--r--  fs/tracefs/inode.c | 3
-rw-r--r--  fs/ubifs/debug.c | 23
-rw-r--r--  fs/ubifs/lpt_commit.c | 1
-rw-r--r--  fs/userfaultfd.c | 2
-rw-r--r--  fs/vboxsf/dir.c | 3
-rw-r--r--  fs/vboxsf/super.c | 3
-rw-r--r--  fs/verity/init.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 13
-rw-r--r--  fs/xfs/scrub/common.h | 5
-rw-r--r--  fs/xfs/scrub/inode_repair.c | 12
-rw-r--r--  fs/xfs/scrub/repair.h | 11
-rw-r--r--  fs/xfs/scrub/scrub.c | 12
-rw-r--r--  fs/xfs/xfs_aops.c | 41
-rw-r--r--  fs/xfs/xfs_buf.c | 36
-rw-r--r--  fs/xfs/xfs_buf.h | 1
-rw-r--r--  fs/xfs/xfs_exchrange.c | 71
-rw-r--r--  fs/xfs/xfs_inode.c | 7
-rw-r--r--  fs/xfs/xfs_iomap.c | 6
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 55
-rw-r--r--  fs/xfs/xfs_super.c | 8
-rw-r--r--  fs/xfs/xfs_sysctl.c | 2
209 files changed, 4186 insertions(+), 1353 deletions(-)
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 698c43dd5dc8..f28bc763847a 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -202,7 +202,7 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
return inode->i_sb->s_fs_info;
}
-static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
+static inline struct v9fs_session_info *v9fs_dentry2v9ses(const struct dentry *dentry)
{
return dentry->d_sb->s_fs_info;
}
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 01338d4c2d9e..5061f192eafd 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -61,7 +61,7 @@ static void v9fs_dentry_release(struct dentry *dentry)
p9_fid_put(hlist_entry(p, struct p9_fid, dlist));
}
-static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
{
struct p9_fid *fid;
struct inode *inode;
@@ -99,14 +99,36 @@ out_valid:
return 1;
}
+static int v9fs_lookup_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
+{
+ return __v9fs_lookup_revalidate(dentry, flags);
+}
+
+static bool v9fs_dentry_unalias_trylock(const struct dentry *dentry)
+{
+ struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
+ return down_write_trylock(&v9ses->rename_sem);
+}
+
+static void v9fs_dentry_unalias_unlock(const struct dentry *dentry)
+{
+ struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
+ up_write(&v9ses->rename_sem);
+}
+
const struct dentry_operations v9fs_cached_dentry_operations = {
.d_revalidate = v9fs_lookup_revalidate,
- .d_weak_revalidate = v9fs_lookup_revalidate,
+ .d_weak_revalidate = __v9fs_lookup_revalidate,
.d_delete = v9fs_cached_dentry_delete,
.d_release = v9fs_dentry_release,
+ .d_unalias_trylock = v9fs_dentry_unalias_trylock,
+ .d_unalias_unlock = v9fs_dentry_unalias_unlock,
};
const struct dentry_operations v9fs_dentry_operations = {
.d_delete = always_delete_dentry,
.d_release = v9fs_dentry_release,
+ .d_unalias_trylock = v9fs_dentry_unalias_trylock,
+ .d_unalias_unlock = v9fs_dentry_unalias_unlock,
};
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index a843c36fc471..02cbf38e1a77 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -23,7 +23,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
static int afs_dir_open(struct inode *inode, struct file *file);
static int afs_readdir(struct file *file, struct dir_context *ctx);
-static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
+static int afs_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_iput(struct dentry *dentry, struct inode *inode);
static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
@@ -597,19 +598,19 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
* Do a lookup of a single name in a directory
* - just returns the FID the dentry name maps to if found
*/
-static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
+static int afs_do_lookup_one(struct inode *dir, const struct qstr *name,
struct afs_fid *fid,
afs_dataversion_t *_dir_version)
{
struct afs_super_info *as = dir->i_sb->s_fs_info;
struct afs_lookup_one_cookie cookie = {
.ctx.actor = afs_lookup_one_filldir,
- .name = dentry->d_name,
+ .name = *name,
.fid.vid = as->volume->vid
};
int ret;
- _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry);
+ _enter("{%lu},{%.*s},", dir->i_ino, name->len, name->name);
/* search the directory */
ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version);
@@ -1023,21 +1024,12 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
/*
* Check the validity of a dentry under RCU conditions.
*/
-static int afs_d_revalidate_rcu(struct dentry *dentry)
+static int afs_d_revalidate_rcu(struct afs_vnode *dvnode, struct dentry *dentry)
{
- struct afs_vnode *dvnode;
- struct dentry *parent;
- struct inode *dir;
long dir_version, de_version;
_enter("%p", dentry);
- /* Check the parent directory is still valid first. */
- parent = READ_ONCE(dentry->d_parent);
- dir = d_inode_rcu(parent);
- if (!dir)
- return -ECHILD;
- dvnode = AFS_FS_I(dir);
if (test_bit(AFS_VNODE_DELETED, &dvnode->flags))
return -ECHILD;
@@ -1065,11 +1057,11 @@ static int afs_d_revalidate_rcu(struct dentry *dentry)
* - NOTE! the hit can be a negative hit too, so we can't assume we have an
* inode
*/
-static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
- struct afs_vnode *vnode, *dir;
+ struct afs_vnode *vnode, *dir = AFS_FS_I(parent_dir);
struct afs_fid fid;
- struct dentry *parent;
struct inode *inode;
struct key *key;
afs_dataversion_t dir_version, invalid_before;
@@ -1077,7 +1069,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
int ret;
if (flags & LOOKUP_RCU)
- return afs_d_revalidate_rcu(dentry);
+ return afs_d_revalidate_rcu(dir, dentry);
if (d_really_is_positive(dentry)) {
vnode = AFS_FS_I(d_inode(dentry));
@@ -1092,14 +1084,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (IS_ERR(key))
key = NULL;
- /* Hold the parent dentry so we can peer at it */
- parent = dget_parent(dentry);
- dir = AFS_FS_I(d_inode(parent));
-
/* validate the parent directory */
ret = afs_validate(dir, key);
if (ret == -ERESTARTSYS) {
- dput(parent);
key_put(key);
return ret;
}
@@ -1127,7 +1114,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
afs_stat_v(dir, n_reval);
/* search the directory for this vnode */
- ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, &dir_version);
+ ret = afs_do_lookup_one(&dir->netfs.inode, name, &fid, &dir_version);
switch (ret) {
case 0:
/* the filename maps to something */
@@ -1171,22 +1158,19 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
goto out_valid;
default:
- _debug("failed to iterate dir %pd: %d",
- parent, ret);
+ _debug("failed to iterate parent %pd2: %d", dentry, ret);
goto not_found;
}
out_valid:
dentry->d_fsdata = (void *)(unsigned long)dir_version;
out_valid_noupdate:
- dput(parent);
key_put(key);
_leave(" = 1 [valid]");
return 1;
not_found:
_debug("dropping dentry %pd2", dentry);
- dput(parent);
key_put(key);
_leave(" = 0 [bad]");
diff --git a/fs/aio.c b/fs/aio.c
index 50671640b588..7b976b564cfc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -224,7 +224,7 @@ static unsigned long aio_nr; /* current system wide number of aio requests */
static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
#ifdef CONFIG_SYSCTL
-static struct ctl_table aio_sysctls[] = {
+static const struct ctl_table aio_sysctls[] = {
{
.procname = "aio-nr",
.data = &aio_nr,
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 42bd1cb7c9cd..583ac81669c2 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -60,14 +60,14 @@ static struct inode *anon_inode_make_secure_inode(
const struct inode *context_inode)
{
struct inode *inode;
- const struct qstr qname = QSTR_INIT(name, strlen(name));
int error;
inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
- error = security_inode_init_security_anon(inode, &qname, context_inode);
+ error = security_inode_init_security_anon(inode, &QSTR(name),
+ context_inode);
if (error) {
iput(inode);
return ERR_PTR(error);
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 85eea7a4dea3..fc7efd0a7525 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -61,6 +61,13 @@ config BCACHEFS_DEBUG
The resulting code will be significantly slower than normal; you
probably shouldn't select this option unless you're a developer.
+config BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ bool "Randomly inject transaction restarts"
+ depends on BCACHEFS_DEBUG
+ help
+ Randomly inject transaction restarts in a few core paths - may have a
+ significant performance penalty
+
config BCACHEFS_TESTS
bool "bcachefs unit and performance tests"
depends on BCACHEFS_FS
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index fc2ef33b67b3..3ea809990ef1 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1803,7 +1803,6 @@ struct discard_buckets_state {
u64 open;
u64 need_journal_commit;
u64 discarded;
- u64 need_journal_commit_this_dev;
};
static int bch2_discard_one_bucket(struct btree_trans *trans,
@@ -1827,11 +1826,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
}
- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- pos.inode, pos.offset)) {
- s->need_journal_commit++;
- s->need_journal_commit_this_dev++;
+ u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
+ pos.inode, pos.offset);
+ if (seq_ready > c->journal.flushed_seq_ondisk) {
+ if (seq_ready > c->journal.flushing_seq)
+ s->need_journal_commit++;
goto out;
}
@@ -1865,23 +1864,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
discard_locked = true;
}
- if (!bkey_eq(*discard_pos_done, iter.pos) &&
- ca->mi.discard && !c->opts.nochanges) {
- /*
- * This works without any other locks because this is the only
- * thread that removes items from the need_discard tree
- */
- bch2_trans_unlock_long(trans);
- blkdev_issue_discard(ca->disk_sb.bdev,
- k.k->p.offset * ca->mi.bucket_size,
- ca->mi.bucket_size,
- GFP_KERNEL);
- *discard_pos_done = iter.pos;
+ if (!bkey_eq(*discard_pos_done, iter.pos)) {
s->discarded++;
+ *discard_pos_done = iter.pos;
- ret = bch2_trans_relock_notrace(trans);
- if (ret)
- goto out;
+ if (ca->mi.discard && !c->opts.nochanges) {
+ /*
+ * This works without any other locks because this is the only
+ * thread that removes items from the need_discard tree
+ */
+ bch2_trans_unlock_long(trans);
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ k.k->p.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+ ret = bch2_trans_relock_notrace(trans);
+ if (ret)
+ goto out;
+ }
}
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
@@ -1929,6 +1929,9 @@ static void bch2_do_discards_work(struct work_struct *work)
POS(ca->dev_idx, U64_MAX), 0, k,
bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
+ if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
+ bch2_journal_flush_async(&c->journal, NULL);
+
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
@@ -2024,7 +2027,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
break;
}
- trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
+ trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
bch2_trans_put(trans);
percpu_ref_put(&ca->io_ref);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 6df41c331a52..5a781fb4c794 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -205,8 +205,12 @@ static inline bool may_alloc_bucket(struct bch_fs *c,
return false;
}
- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) {
+ u64 journal_seq_ready =
+ bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
+ bucket.inode, bucket.offset);
+ if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
+ if (journal_seq_ready > c->journal.flushing_seq)
+ s->need_journal_commit++;
s->skipped_need_journal_commit++;
return false;
}
@@ -570,7 +574,7 @@ alloc:
? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
- if (s.skipped_need_journal_commit * 2 > avail)
+ if (s.need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 9bbb28e90b93..4aa8ee026cb8 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -18,6 +18,7 @@ struct bucket_alloc_state {
u64 buckets_seen;
u64 skipped_open;
u64 skipped_need_journal_commit;
+ u64 need_journal_commit;
u64 skipped_nocow;
u64 skipped_nouse;
u64 skipped_mi_btree_bitmap;
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 672ca2c1d37d..ca755e8d1a37 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -24,7 +24,10 @@ do { \
} while (0)
const char * const bch2_btree_node_flags[] = {
-#define x(f) #f,
+ "typebit",
+ "typebit",
+ "typebit",
+#define x(f) [BTREE_NODE_##f] = #f,
BTREE_FLAGS()
#undef x
NULL
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 367231ab1980..e32fce4fd258 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2239,8 +2239,6 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
if (unlikely(ret))
return bkey_s_c_err(ret);
- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
-
k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
if (!k.k)
return k;
@@ -2251,6 +2249,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
iter->k = u;
k.k = &iter->k;
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
return k;
}
@@ -2358,6 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
if (iter->update_path) {
bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_intent);
@@ -2623,6 +2628,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
+ int ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
while (1) {
k = __bch2_btree_iter_peek_prev(iter, search_key);
if (unlikely(!k.k))
@@ -2750,6 +2761,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_is_extents) &&
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
@@ -3107,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (ret)
+ return ERR_PTR(ret);
+
struct btree_transaction_stats *s = btree_trans_stats(trans);
s->max_mem = max(s->max_mem, new_bytes);
@@ -3164,7 +3185,8 @@ out_new_mem:
if (old_bytes) {
trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
+ return ERR_PTR(btree_trans_restart_ip(trans,
+ BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
}
out_change_top:
p = trans->mem + trans->mem_top;
@@ -3272,6 +3294,14 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->last_begin_ip = _RET_IP_;
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ if (trans->restarted) {
+ trans->restart_count_this_trans++;
+ } else {
+ trans->restart_count_this_trans = 0;
+ }
+#endif
+
trans_set_locked(trans, false);
if (trans->restarted) {
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index b9538e6e6d65..b96157f3dc9c 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -355,6 +355,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err)
return btree_trans_restart_ip(trans, err, _THIS_IP_);
}
+static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
+{
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
+ trace_and_count(trans->c, trans_restart_injected, trans, ip);
+ return btree_trans_restart_ip(trans,
+ BCH_ERR_transaction_restart_fault_inject, ip);
+ }
+#endif
+ return 0;
+}
+
bool bch2_btree_node_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
@@ -739,7 +751,7 @@ transaction_restart: \
if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
\
- _ret2 ?: trans_was_restarted(_trans, _restart_count); \
+ _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \
})
#define for_each_btree_key_max_continue(_trans, _iter, \
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 3b62296c3100..1821f40c161a 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -291,8 +291,10 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
struct btree_path *ck_path,
unsigned flags)
{
- if (flags & BTREE_ITER_cached_nofill)
+ if (flags & BTREE_ITER_cached_nofill) {
+ ck_path->l[0].b = NULL;
return 0;
+ }
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -746,7 +748,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
rcu_read_unlock();
mutex_lock(&bc->table.mutex);
mutex_unlock(&bc->table.mutex);
- rcu_read_lock();
continue;
}
for (i = 0; i < tbl->size; i++)
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 6b79b672e0b1..c4f524b2ca9a 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
- trans->journal_u64s, flags);
+ trans->journal_u64s, flags, trans);
}
#define JSET_ENTRY_LOG_U64s 4
@@ -999,6 +999,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret))
+ goto out_reset;
+
if (!trans->nr_updates &&
!trans->journal_entries_u64s)
goto out_reset;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index a6f251eb4164..a09cbe9cd94f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -509,6 +509,9 @@ struct btree_trans {
bool notrace_relock_fail:1;
enum bch_errcode restarted:16;
u32 restart_count;
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ u32 restart_count_this_trans;
+#endif
u64 last_begin_time;
unsigned long last_begin_ip;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index f4aeadbe53c1..e4e7c804625e 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -681,9 +681,11 @@ static void btree_update_nodes_written(struct btree_update *as)
b = as->old_nodes[i];
+ bch2_trans_begin(trans);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
seq = b->data ? b->data->keys.seq : 0;
six_unlock_read(&b->c.lock);
+ bch2_trans_unlock_long(trans);
if (seq == as->old_nodes_seq[i])
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 7930ffea3075..26d646e1275c 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -278,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
- (void *) btree_bkey_last(b, bset_tree_last(b)));
+ (void *) btree_bkey_last(b, t));
ssize_t remaining_space =
__bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
- if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+ if (b->written + block_sectors(c) <= btree_sectors(c))
return bne;
} else {
if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
index f9fb150eda70..c8a488e6b7b8 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_
memset(t->d, 0, sizeof(t->d[0]) << t->bits);
}
-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
- u64 flushed_seq,
- unsigned dev, u64 bucket)
+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b,
+ unsigned dev, u64 bucket)
{
struct buckets_waiting_for_journal_table *t;
u64 dev_bucket = (u64) dev << 56 | bucket;
- bool ret = false;
- unsigned i;
+ u64 ret = 0;
mutex_lock(&b->lock);
t = b->t;
- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
if (h->dev_bucket == dev_bucket) {
- ret = h->journal_seq > flushed_seq;
+ ret = h->journal_seq;
break;
}
}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
index d2ae19cbe18c..365619ca44c8 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.h
+++ b/fs/bcachefs/buckets_waiting_for_journal.h
@@ -4,8 +4,8 @@
#include "buckets_waiting_for_journal_types.h"
-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
- u64, unsigned, u64);
+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *,
+ unsigned, u64);
int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
u64, unsigned, u64, u64);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index f99ff1819597..114bf2f3879f 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -4,6 +4,7 @@
#include "compress.h"
#include "error.h"
#include "extents.h"
+#include "io_write.h"
#include "opts.h"
#include "super-io.h"
@@ -254,11 +255,14 @@ err:
goto out;
}
-int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
- struct bch_extent_crc_unpacked *crc)
+int bch2_bio_uncompress_inplace(struct bch_write_op *op,
+ struct bio *bio)
{
+ struct bch_fs *c = op->c;
+ struct bch_extent_crc_unpacked *crc = &op->crc;
struct bbuf data = { NULL };
size_t dst_len = crc->uncompressed_size << 9;
+ int ret = 0;
/* bio must own its pages: */
BUG_ON(!bio->bi_vcnt);
@@ -266,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc->compressed_size << 9 > c->opts.encoded_extent_max) {
- bch_err(c, "error rewriting existing data: extent too big");
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "error rewriting existing data: extent too big");
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
return -EIO;
}
data = __bounce_alloc(c, dst_len, WRITE);
if (__bio_uncompress(c, bio, data.b, *crc)) {
- if (!c->opts.no_data_io)
- bch_err(c, "error rewriting existing data: decompression error");
- bio_unmap_or_unbounce(c, data);
- return -EIO;
+ if (!c->opts.no_data_io) {
+ struct printbuf buf = PRINTBUF;
+ bch2_write_op_error(&buf, op);
+ prt_printf(&buf, "error rewriting existing data: decompression error");
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ ret = -EIO;
+ goto err;
}
/*
@@ -293,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
crc->uncompressed_size = crc->live_size;
crc->offset = 0;
crc->csum = (struct bch_csum) { 0, 0 };
-
+err:
bio_unmap_or_unbounce(c, data);
- return 0;
+ return ret;
}
int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 607fd5e232c9..bec2f05bfd52 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,8 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
-int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
- struct bch_extent_crc_unpacked *);
+struct bch_write_op;
+int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
struct bvec_iter, struct bch_extent_crc_unpacked);
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 585214931e05..337494facac6 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -91,15 +91,28 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
return true;
}
-static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
+static noinline void trace_move_extent_finish2(struct data_update *u,
+ struct bkey_i *new,
+ struct bkey_i *insert)
{
- if (trace_move_extent_finish_enabled()) {
- struct printbuf buf = PRINTBUF;
+ struct bch_fs *c = u->op.c;
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- trace_move_extent_finish(c, buf.buf);
- printbuf_exit(&buf);
- }
+ prt_newline(&buf);
+
+ bch2_data_update_to_text(&buf, u);
+ prt_newline(&buf);
+
+ prt_str_indented(&buf, "new replicas:\t");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ prt_newline(&buf);
+
+ prt_str_indented(&buf, "insert:\t");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ prt_newline(&buf);
+
+ trace_move_extent_finish(c, buf.buf);
+ printbuf_exit(&buf);
}
static void trace_move_extent_fail2(struct data_update *m,
@@ -372,7 +385,8 @@ restart_drop_extra_replicas:
bch2_btree_iter_set_pos(&iter, next_pos);
this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
- trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i));
+ if (trace_move_extent_finish_enabled())
+ trace_move_extent_finish2(m, &new->k_i, insert);
}
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -525,34 +539,38 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
struct data_update_opts *data_opts)
{
printbuf_tabstop_push(out, 20);
- prt_str(out, "rewrite ptrs:\t");
+
+ prt_str_indented(out, "rewrite ptrs:\t");
bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
prt_newline(out);
- prt_str(out, "kill ptrs:\t");
+ prt_str_indented(out, "kill ptrs:\t");
bch2_prt_u64_base2(out, data_opts->kill_ptrs);
prt_newline(out);
- prt_str(out, "target:\t");
+ prt_str_indented(out, "target:\t");
bch2_target_to_text(out, c, data_opts->target);
prt_newline(out);
- prt_str(out, "compression:\t");
+ prt_str_indented(out, "compression:\t");
bch2_compression_opt_to_text(out, io_opts->background_compression);
prt_newline(out);
- prt_str(out, "opts.replicas:\t");
+ prt_str_indented(out, "opts.replicas:\t");
prt_u64(out, io_opts->data_replicas);
+ prt_newline(out);
- prt_str(out, "extra replicas:\t");
+ prt_str_indented(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas);
}
void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
{
- bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
- prt_newline(out);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
+ prt_newline(out);
+
+ prt_str_indented(out, "old key:\t");
+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
}
int bch2_extent_drop_ptrs(struct btree_trans *trans,
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index b5de52a50d10..55333e82d1fe 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -20,6 +20,7 @@
#include "extents.h"
#include "fsck.h"
#include "inode.h"
+#include "journal_reclaim.h"
#include "super.h"
#include <linux/console.h>
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index 5360cbb3ec29..f4372cafea2e 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -210,11 +210,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *
static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
u64 *v, unsigned nr)
{
+ percpu_down_read(&c->mark_lock);
struct bch_accounting_mem *acc = &c->accounting;
unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &p);
bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
+ percpu_up_read(&c->mark_lock);
}
static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 8fcf7c8e5ede..9bf316e7b845 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -450,7 +450,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
return ret;
struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
- struct qstr name = (struct qstr) QSTR(name_buf);
+ struct qstr name = QSTR(name_buf);
inode->bi_dir = lostfound.bi_inum;
@@ -823,6 +823,7 @@ struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
u64 count;
+ u64 i_size;
};
struct inode_walker {
@@ -910,8 +911,9 @@ found:
if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
- new.snapshot = k.k->p.snapshot;
- new.count = 0;
+ new.snapshot = k.k->p.snapshot;
+ new.count = 0;
+ new.i_size = 0;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
@@ -1116,37 +1118,6 @@ err:
return ret;
}
-static int check_directory_size(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- struct bkey_s_c inode_k, bool *write_inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 new_size = 0;
- int ret;
-
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot),
- POS(inode_k.k->p.offset, U64_MAX),
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_name(dirent);
-
- new_size += dirent_occupied_size(&name);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && inode_u->bi_size != new_size) {
- inode_u->bi_size = new_size;
- *write_inode = true;
- }
-
- return ret;
-}
-
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -1335,16 +1306,6 @@ static int check_inode(struct btree_trans *trans,
u.bi_journal_seq = journal_cur_seq(&c->journal);
do_update = true;
}
-
- if (S_ISDIR(u.bi_mode)) {
- ret = check_directory_size(trans, &u, k, &do_update);
-
- fsck_err_on(ret,
- trans, directory_size_mismatch,
- "directory inode %llu:%u with the mismatch directory size",
- u.bi_inum, k.k->p.snapshot);
- ret = 0;
- }
do_update:
if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u);
@@ -2017,10 +1978,31 @@ fsck_err:
return ret;
}
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ darray_for_each(w->inodes, i)
+ if (fsck_err_on(i->inode.bi_size != i->i_size,
+ trans, inode_dir_wrong_nlink,
+ "directory %llu:%u with wrong i_size: got %llu, should be %llu",
+ w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) {
+ i->inode.bi_size = i->i_size;
+ ret = bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ break;
+ }
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
{
u32 restart_count = trans->restart_count;
return check_subdir_count_notnested(trans, w) ?:
+ check_dir_i_size_notnested(trans, w) ?:
trans_was_restarted(trans, restart_count);
}
@@ -2367,7 +2349,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto out;
if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
- ret = check_subdir_count(trans, dir);
+ ret = check_subdir_dirents_count(trans, dir);
if (ret)
goto err;
}
@@ -2457,9 +2439,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
+ if (d.v->d_type == DT_DIR)
i->count++;
+ i->i_size += bkey_bytes(d.k);
+ }
out:
err:
fsck_err:
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index d2e134528f0e..428b9be6af34 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -285,12 +285,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
+#include "rebalance.h"
+
static inline struct bch_extent_rebalance
bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
{
struct bch_io_opts io_opts;
bch2_inode_opts_get(&io_opts, c, inode);
- return io_opts_to_rebalance_opts(&io_opts);
+ return io_opts_to_rebalance_opts(c, &io_opts);
}
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 3e71860f66b9..03892388832b 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -406,11 +406,21 @@ static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
}
-static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
+void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
{
__bch2_write_op_error(out, op, op->pos.offset);
}
+static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
+ struct bch_write_op *op, u64 offset)
+{
+ bch2_inum_offset_err_msg_trans(trans, out,
+ (subvol_inum) { op->subvol, op->pos.inode, },
+ offset << 9);
+ prt_printf(out, "write error%s: ",
+ op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
+}
+
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
enum bch_data_type type,
const struct bkey_i *k,
@@ -873,7 +883,7 @@ static enum prep_encoded_ret {
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
return PREP_ENCODED_CHECKSUM_ERR;
- if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+ if (bch2_bio_uncompress_inplace(op, bio))
return PREP_ENCODED_ERR;
}
@@ -1193,7 +1203,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
struct printbuf buf = PRINTBUF;
- __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
+ bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k));
prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index 5400ce94ee57..b4626013abc8 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -20,6 +20,8 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);
+void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);
+
#define BCH_WRITE_FLAGS() \
x(ALLOC_NOWAIT) \
x(CACHED) \
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 2cd20114b74b..24c294d4634e 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq)
static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(p->list); i++)
- INIT_LIST_HEAD(&p->list[i]);
- INIT_LIST_HEAD(&p->flushed);
+ for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
+ INIT_LIST_HEAD(&p->unflushed[i]);
+ for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
+ INIT_LIST_HEAD(&p->flushed[i]);
atomic_set(&p->count, count);
p->devs.nr = 0;
}
@@ -320,6 +319,16 @@ void bch2_journal_halt(struct journal *j)
spin_unlock(&j->lock);
}
+void bch2_journal_halt_locked(struct journal *j)
+{
+ lockdep_assert_held(&j->lock);
+
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
+ journal_wake(j);
+}
+
static bool journal_entry_want_write(struct journal *j)
{
bool ret = !journal_entry_is_open(j) ||
@@ -382,9 +391,12 @@ static int journal_entry_open(struct journal *j)
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
return JOURNAL_ERR_max_in_flight;
- if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX,
- c, "cannot start: journal seq overflow"))
+ if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) {
+ bch_err(c, "cannot start: journal seq overflow");
+ if (bch2_fs_emergency_read_only_locked(c))
+ bch_err(c, "fatal error - emergency read only");
return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+ }
BUG_ON(!j->cur_entry_sectors);
@@ -601,6 +613,16 @@ out:
: -BCH_ERR_journal_res_get_blocked;
}
+static unsigned max_dev_latency(struct bch_fs *c)
+{
+ u64 nsecs = 0;
+
+ for_each_rw_member(c, ca)
+ nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
+
+ return nsecs_to_jiffies(nsecs);
+}
+
/*
* Essentially the entry function to the journaling code. When bcachefs is doing
* a btree insert, it calls this function to get the current journal write.
@@ -612,17 +634,31 @@ out:
* btree node write locks.
*/
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
- unsigned flags)
+ unsigned flags,
+ struct btree_trans *trans)
{
int ret;
if (closure_wait_event_timeout(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK),
- HZ * 10))
+ HZ))
return ret;
+ if (trans)
+ bch2_trans_unlock_long(trans);
+
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10);
+
+ remaining_wait = max(0, remaining_wait - HZ);
+
+ if (closure_wait_event_timeout(&j->async_wait,
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ (flags & JOURNAL_RES_GET_NONBLOCK),
+ remaining_wait))
+ return ret;
+
struct printbuf buf = PRINTBUF;
bch2_journal_debug_to_text(&buf, j);
bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
@@ -727,7 +763,7 @@ recheck_need_open:
* livelock:
*/
sched_annotate_sleep();
- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
if (ret)
return ret;
@@ -760,6 +796,7 @@ recheck_need_open:
}
buf->must_flush = true;
+ j->flushing_seq = max(j->flushing_seq, seq);
if (parent && !closure_wait(&buf->wait, parent))
BUG();
@@ -848,7 +885,7 @@ out:
static int __bch2_journal_meta(struct journal *j)
{
struct journal_res res = {};
- int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL);
if (ret)
return ret;
@@ -1602,54 +1639,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
__bch2_journal_debug_to_text(out, j);
spin_unlock(&j->lock);
}
-
-bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *pin;
-
- spin_lock(&j->lock);
- if (!test_bit(JOURNAL_running, &j->flags)) {
- spin_unlock(&j->lock);
- return true;
- }
-
- *seq = max(*seq, j->pin.front);
-
- if (*seq >= j->pin.back) {
- spin_unlock(&j->lock);
- return true;
- }
-
- out->atomic++;
-
- pin_list = journal_seq_pin(j, *seq);
-
- prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
- printbuf_indent_add(out, 2);
-
- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
- list_for_each_entry(pin, &pin_list->list[i], list)
- prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
- if (!list_empty(&pin_list->flushed))
- prt_printf(out, "flushed:\n");
-
- list_for_each_entry(pin, &pin_list->flushed, list)
- prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
- printbuf_indent_sub(out, 2);
-
- --out->atomic;
- spin_unlock(&j->lock);
-
- return false;
-}
-
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
-{
- u64 seq = 0;
-
- while (!bch2_journal_seq_pins_to_text(out, j, &seq))
- seq++;
-}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index cb0df0663946..107f7f901cd9 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -312,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j,
}
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
- unsigned);
+ unsigned, struct btree_trans *);
/* First bits for BCH_WATERMARK: */
enum journal_res_flags {
@@ -368,7 +368,8 @@ static inline int journal_res_get_fast(struct journal *j,
}
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
- unsigned u64s, unsigned flags)
+ unsigned u64s, unsigned flags,
+ struct btree_trans *trans)
{
int ret;
@@ -380,7 +381,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
if (journal_res_get_fast(j, res, flags))
goto out;
- ret = bch2_journal_res_get_slowpath(j, res, flags);
+ ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
if (ret)
return ret;
out:
@@ -408,6 +409,7 @@ bool bch2_journal_noflush_seq(struct journal *, u64, u64);
int bch2_journal_meta(struct journal *);
void bch2_journal_halt(struct journal *);
+void bch2_journal_halt_locked(struct journal *);
static inline int bch2_journal_error(struct journal *j)
{
@@ -429,8 +431,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 7f2efe85a805..11c39e0c34f4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,6 +17,7 @@
#include "sb-clean.h"
#include "trace.h"
+#include <linux/ioprio.h>
#include <linux/string_choices.h>
void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
@@ -1763,6 +1764,7 @@ static CLOSURE_CALLBACK(journal_write_submit)
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);
BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
ca->prev_journal_sector = bio->bi_iter.bi_sector;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 3c8242606da7..d373cd181a7f 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j)
popped = true;
}
- if (popped)
+ if (popped) {
bch2_journal_space_available(j);
+ __closure_wake_up(&j->reclaim_flush_wait);
+ }
}
bool __bch2_journal_pin_put(struct journal *j, u64 seq)
@@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j,
pin->seq = 0;
list_del_init(&pin->list);
+ if (j->reclaim_flush_wait.list.first)
+ __closure_wake_up(&j->reclaim_flush_wait);
+
/*
* Unpinning a journal entry may make journal_next_bucket() succeed, if
* writing a new last_seq will now make another bucket available:
@@ -379,15 +384,19 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
-static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
+ journal_pin_flush_fn fn)
{
if (fn == bch2_btree_node_flush0 ||
- fn == bch2_btree_node_flush1)
- return JOURNAL_PIN_btree;
- else if (fn == bch2_btree_key_cache_journal_flush)
- return JOURNAL_PIN_key_cache;
+ fn == bch2_btree_node_flush1) {
+ unsigned idx = fn == bch2_btree_node_flush1;
+ struct btree *b = container_of(pin, struct btree, writes[idx].journal);
+
+ return JOURNAL_PIN_TYPE_btree0 - b->c.level;
+ } else if (fn == bch2_btree_key_cache_journal_flush)
+ return JOURNAL_PIN_TYPE_key_cache;
else
- return JOURNAL_PIN_other;
+ return JOURNAL_PIN_TYPE_other;
}
static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
@@ -406,7 +415,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
atomic_inc(&pin_list->count);
pin->seq = seq;
pin->flush = flush_fn;
- list_add(&pin->list, &pin_list->list[type]);
+
+ if (list_empty(&pin_list->unflushed[type]) &&
+ j->reclaim_flush_wait.list.first)
+ __closure_wake_up(&j->reclaim_flush_wait);
+
+ list_add(&pin->list, &pin_list->unflushed[type]);
}
void bch2_journal_pin_copy(struct journal *j,
@@ -431,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j,
bool reclaim = __journal_pin_drop(j, dst);
- bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -455,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
bool reclaim = __journal_pin_drop(j, pin);
- bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -499,16 +513,15 @@ journal_get_next_pin(struct journal *j,
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
- unsigned i;
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
if (*seq > seq_to_flush && !allowed_above_seq)
break;
- for (i = 0; i < JOURNAL_PIN_NR; i++)
- if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
- ((1U << i) & allowed_above_seq)) {
- ret = list_first_entry_or_null(&pin_list->list[i],
+ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
+ if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+ (BIT(i) & allowed_above_seq)) {
+ ret = list_first_entry_or_null(&pin_list->unflushed[i],
struct journal_entry_pin, list);
if (ret)
return ret;
@@ -544,8 +557,8 @@ static size_t journal_flush_pins(struct journal *j,
}
if (min_key_cache) {
- allowed_above |= 1U << JOURNAL_PIN_key_cache;
- allowed_below |= 1U << JOURNAL_PIN_key_cache;
+ allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
+ allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
}
cond_resched();
@@ -553,7 +566,9 @@ static size_t journal_flush_pins(struct journal *j,
j->last_flushed = jiffies;
spin_lock(&j->lock);
- pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
+ pin = journal_get_next_pin(j, seq_to_flush,
+ allowed_below,
+ allowed_above, &seq);
if (pin) {
BUG_ON(j->flush_in_progress);
j->flush_in_progress = pin;
@@ -576,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j,
spin_lock(&j->lock);
/* Pin might have been dropped or rearmed: */
if (likely(!err && !j->flush_in_progress_dropped))
- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
j->flush_in_progress = NULL;
j->flush_in_progress_dropped = false;
spin_unlock(&j->lock);
@@ -816,10 +831,41 @@ int bch2_journal_reclaim_start(struct journal *j)
return 0;
}
+static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
+ unsigned types)
+{
+ struct journal_entry_pin_list *pin_list;
+ u64 seq;
+
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
+ if (seq > seq_to_flush)
+ break;
+
+ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
+ if ((BIT(i) & types) &&
+ (!list_empty(&pin_list->unflushed[i]) ||
+ !list_empty(&pin_list->flushed[i]))) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+ }
+ spin_unlock(&j->lock);
+
+ return false;
+}
+
+static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
+ unsigned types)
+{
+ return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
+ journal_pins_still_flushing(j, seq_to_flush, types);
+}
+
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
bool *did_work)
{
- int ret;
+ int ret = 0;
ret = bch2_journal_error(j);
if (ret)
@@ -827,12 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- if (journal_flush_pins(j, seq_to_flush,
- (1U << JOURNAL_PIN_key_cache)|
- (1U << JOURNAL_PIN_other), 0, 0, 0) ||
- journal_flush_pins(j, seq_to_flush,
- (1U << JOURNAL_PIN_btree), 0, 0, 0))
- *did_work = true;
+ for (int type = JOURNAL_PIN_TYPE_NR - 1;
+ type >= 0;
+ --type)
+ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
+ *did_work = true;
+ goto unlock;
+ }
if (seq_to_flush > journal_cur_seq(j))
bch2_journal_entry_close(j);
@@ -847,6 +894,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
!fifo_used(&j->pin);
spin_unlock(&j->lock);
+unlock:
mutex_unlock(&j->reclaim_lock);
return ret;
@@ -860,7 +908,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
if (!test_bit(JOURNAL_running, &j->flags))
return false;
- closure_wait_event(&j->async_wait,
+ closure_wait_event(&j->reclaim_flush_wait,
journal_flush_done(j, seq_to_flush, &did_work));
return did_work;
@@ -926,3 +974,54 @@ err:
return ret;
}
+
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *pin;
+
+ spin_lock(&j->lock);
+ if (!test_bit(JOURNAL_running, &j->flags)) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ *seq = max(*seq, j->pin.front);
+
+ if (*seq >= j->pin.back) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ out->atomic++;
+
+ pin_list = journal_seq_pin(j, *seq);
+
+ prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "unflushed:\n");
+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
+ list_for_each_entry(pin, &pin_list->unflushed[i], list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+ prt_printf(out, "flushed:\n");
+ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
+ list_for_each_entry(pin, &pin_list->flushed[i], list)
+ prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+ printbuf_indent_sub(out, 2);
+
+ --out->atomic;
+ spin_unlock(&j->lock);
+
+ return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+ u64 seq = 0;
+
+ while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+ seq++;
+}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index ec84c3345281..0a73d7134e1c 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j)
int bch2_journal_flush_device_pins(struct journal *, int);
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
+
#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index e9bd716fbb71..1ef3a28ed6ab 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -53,15 +53,18 @@ struct journal_buf {
*/
enum journal_pin_type {
- JOURNAL_PIN_btree,
- JOURNAL_PIN_key_cache,
- JOURNAL_PIN_other,
- JOURNAL_PIN_NR,
+ JOURNAL_PIN_TYPE_btree3,
+ JOURNAL_PIN_TYPE_btree2,
+ JOURNAL_PIN_TYPE_btree1,
+ JOURNAL_PIN_TYPE_btree0,
+ JOURNAL_PIN_TYPE_key_cache,
+ JOURNAL_PIN_TYPE_other,
+ JOURNAL_PIN_TYPE_NR,
};
struct journal_entry_pin_list {
- struct list_head list[JOURNAL_PIN_NR];
- struct list_head flushed;
+ struct list_head unflushed[JOURNAL_PIN_TYPE_NR];
+ struct list_head flushed[JOURNAL_PIN_TYPE_NR];
atomic_t count;
struct bch_devs_list devs;
};
@@ -226,6 +229,7 @@ struct journal {
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
+ struct closure_waitlist reclaim_flush_wait;
struct delayed_work write_work;
struct workqueue_struct *wq;
@@ -236,6 +240,7 @@ struct journal {
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
u64 flushed_seq_ondisk;
+ u64 flushing_seq;
u64 last_seq_ondisk;
u64 err_seq;
u64 last_empty_seq;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 85c361e78ba5..21805509ab9e 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -215,7 +215,8 @@ static int bch2_copygc(struct moving_context *ctxt,
};
move_buckets buckets = { 0 };
struct move_bucket_in_flight *f;
- u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
+ u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
+ u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
@@ -245,7 +246,6 @@ static int bch2_copygc(struct moving_context *ctxt,
*did_work = true;
}
err:
- darray_exit(&buckets);
/* no entries in LRU btree found, or got to end: */
if (bch2_err_matches(ret, ENOENT))
@@ -254,8 +254,11 @@ err:
if (ret < 0 && !bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "from bch2_move_data()");
- moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
- trace_and_count(c, copygc, c, moved, 0, 0, 0);
+ sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
+ sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
+ trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved);
+
+ darray_exit(&buckets);
return ret;
}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index e763d52e0f38..9d397fc2a1f0 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -476,13 +476,13 @@ enum fsck_err_opts {
NULL, "Enable nocow mode: enables runtime locking in\n"\
"data move path needed if nocow will ever be in use\n")\
x(copygc_enabled, u8, \
- OPT_FS|OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Enable copygc: disable for debugging, or to\n"\
"quiet the system when doing performance testing\n")\
x(rebalance_enabled, u8, \
- OPT_FS|OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Enable rebalance: disable for debugging, or to\n"\
@@ -659,18 +659,4 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
-/* rebalance opts: */
-
-static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts)
-{
- return (struct bch_extent_rebalance) {
- .type = BIT(BCH_EXTENT_ENTRY_rebalance),
-#define x(_name) \
- ._name = opts->_name, \
- ._name##_from_inode = opts->_name##_from_inode,
- BCH_REBALANCE_OPTS()
-#undef x
- };
-};
-
#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 4adc74cd3f70..d0a1f5cd5c2b 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -121,12 +121,10 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
}
incompressible:
- if (opts->background_target &&
- bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) {
+ if (opts->background_target)
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
sectors += p.crc.compressed_size;
- }
return sectors;
}
@@ -140,7 +138,7 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt
const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
- struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts);
+ struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
return old == NULL || memcmp(old, &new, sizeof(new));
} else {
return old != NULL;
@@ -163,7 +161,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
k.k->u64s += sizeof(*old) / sizeof(u64);
}
- *old = io_opts_to_rebalance_opts(opts);
+ *old = io_opts_to_rebalance_opts(c, opts);
} else {
if (old)
extent_entry_drop(k, (union bch_extent_entry *) old);
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 0a0821ab895d..62a3859d3823 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -4,8 +4,28 @@
#include "compress.h"
#include "disk_groups.h"
+#include "opts.h"
#include "rebalance_types.h"
+static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
+ struct bch_io_opts *opts)
+{
+ struct bch_extent_rebalance r = {
+ .type = BIT(BCH_EXTENT_ENTRY_rebalance),
+#define x(_name) \
+ ._name = opts->_name, \
+ ._name##_from_inode = opts->_name##_from_inode,
+ BCH_REBALANCE_OPTS()
+#undef x
+ };
+
+ if (r.background_target &&
+ !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
+ r.background_target = 0;
+
+ return r;
+}
+
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
int bch2_get_update_rebalance_opts(struct btree_trans *,
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 98825437381c..71c786cdb192 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -32,7 +32,6 @@
#include <linux/sort.h>
#include <linux/stat.h>
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
{
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 93ba4f4e47ca..441e648f28b5 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -172,7 +172,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
bool should_commit)
{
if (REFLINK_P_ERROR(p.v))
- return -BCH_ERR_missing_indirect_extent;
+ return 0;
struct bch_fs *c = trans->c;
u64 live_start = REFLINK_P_IDX(p.v);
@@ -259,8 +259,6 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
return k;
if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
- bch2_trans_iter_exit(trans, iter);
-
unsigned size = min((u64) k.k->size,
REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
reflink_offset);
@@ -268,14 +266,16 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
k.k->p.offset, should_commit);
- if (ret)
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
return bkey_s_c_err(ret);
+ }
} else if (unlikely(REFLINK_P_ERROR(p.v))) {
- bch2_trans_iter_exit(trans, iter);
-
int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
- if (ret)
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
return bkey_s_c_err(ret);
+ }
}
*offset_into_extent = reflink_offset - bkey_start_offset(k.k);
@@ -300,7 +300,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
if (ret)
return ret;
- if (bkey_deleted(k.k)) {
+ if (!bkey_refcount_c(k)) {
if (!(flags & BTREE_TRIGGER_overwrite))
ret = -BCH_ERR_missing_indirect_extent;
goto next;
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 14f6b6a5fb38..35e07bc8fbd3 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -92,7 +92,7 @@
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
BCH_FSCK_ERR_accounting_key_junk_at_end) \
x(directory_size, \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
BCH_FSCK_ERR_directory_size_mismatch) \
#define DOWNGRADE_TABLE() \
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 0b4fe899209b..b86ec013d7d7 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -57,7 +57,7 @@ enum bch_fsck_flags {
x(bset_wrong_sector_offset, 44, 0) \
x(bset_empty, 45, 0) \
x(bset_bad_seq, 46, 0) \
- x(bset_blacklisted_journal_seq, 47, 0) \
+ x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \
x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \
x(btree_node_bad_btree, 49, 0) \
x(btree_node_bad_level, 50, 0) \
@@ -180,9 +180,9 @@ enum bch_fsck_flags {
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
- x(reflink_v_refcount_wrong, 165, 0) \
+ x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
x(reflink_v_pos_bad, 292, 0) \
- x(reflink_p_to_missing_reflink_v, 166, 0) \
+ x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \
x(reflink_refcount_underflow, 293, 0) \
x(stripe_pos_bad, 167, 0) \
x(stripe_val_size_bad, 168, 0) \
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index 8c2c5539de2e..d78451c2a0c6 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -31,11 +31,11 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir
}
}
-static int fsck_rename_dirent(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c_dirent old)
+static noinline int fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old)
{
struct qstr old_name = bch2_dirent_get_name(old);
struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
@@ -71,11 +71,11 @@ static int fsck_rename_dirent(struct btree_trans *trans,
return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
}
-static int hash_pick_winner(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c k1,
- struct bkey_s_c k2)
+static noinline int hash_pick_winner(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c k1,
+ struct bkey_s_c k2)
{
if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
!memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
@@ -142,8 +142,8 @@ fsck_err:
* All versions of the same inode in different snapshots must have the same hash
* seed/type: verify that the hash info we're using matches the root
*/
-static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
- struct bch_hash_info *hash_info)
+static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
+ struct bch_hash_info *hash_info)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index e3d0475232e5..b7b96283c316 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -428,7 +428,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
bch2_bkey_get_iter_typed(trans, &snapshot_iter,
BTREE_ID_snapshots, POS(0, snapid),
0, snapshot);
- ret = bkey_err(subvol);
+ ret = bkey_err(snapshot);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
"missing snapshot %u", snapid);
if (ret)
@@ -440,6 +440,11 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
BTREE_ID_snapshot_trees, POS(0, treeid),
0, snapshot_tree);
+ ret = bkey_err(snapshot_tree);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing snapshot tree %u", treeid);
+ if (ret)
+ goto err;
if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
struct bkey_i_snapshot_tree *snapshot_tree_mut =
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index d97ea7bd1171..6d97d412fed9 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -411,6 +411,17 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
return ret;
}
+bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
+{
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
+
+ bch2_journal_halt_locked(&c->journal);
+ bch2_fs_read_only_async(c);
+
+ wake_up(&bch2_read_only_wait);
+ return ret;
+}
+
static int bch2_fs_read_write_late(struct bch_fs *c)
{
int ret;
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index fa6d52216510..04f8287eff5c 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -29,6 +29,7 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
+bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
void bch2_fs_read_only(struct bch_fs *);
int bch2_fs_read_write(struct bch_fs *);
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 9d40b7d4ea29..c1b51009edf6 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -727,7 +727,7 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail,
TP_ARGS(c, str)
);
-TRACE_EVENT(discard_buckets,
+DECLARE_EVENT_CLASS(discard_buckets_class,
TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
u64 need_journal_commit, u64 discarded, const char *err),
TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
@@ -759,6 +759,18 @@ TRACE_EVENT(discard_buckets,
__entry->err)
);
+DEFINE_EVENT(discard_buckets_class, discard_buckets,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+);
+
+DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+);
+
TRACE_EVENT(bucket_invalidate,
TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
TP_ARGS(c, dev, bucket, sectors),
@@ -902,32 +914,30 @@ TRACE_EVENT(evacuate_bucket,
TRACE_EVENT(copygc,
TP_PROTO(struct bch_fs *c,
- u64 sectors_moved, u64 sectors_not_moved,
- u64 buckets_moved, u64 buckets_not_moved),
- TP_ARGS(c,
- sectors_moved, sectors_not_moved,
- buckets_moved, buckets_not_moved),
+ u64 buckets,
+ u64 sectors_seen,
+ u64 sectors_moved),
+ TP_ARGS(c, buckets, sectors_seen, sectors_moved),
TP_STRUCT__entry(
__field(dev_t, dev )
+ __field(u64, buckets )
+ __field(u64, sectors_seen )
__field(u64, sectors_moved )
- __field(u64, sectors_not_moved )
- __field(u64, buckets_moved )
- __field(u64, buckets_not_moved )
),
TP_fast_assign(
__entry->dev = c->dev;
+ __entry->buckets = buckets;
+ __entry->sectors_seen = sectors_seen;
__entry->sectors_moved = sectors_moved;
- __entry->sectors_not_moved = sectors_not_moved;
- __entry->buckets_moved = buckets_moved;
- __entry->buckets_not_moved = buckets_moved;
),
- TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+ TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->sectors_moved, __entry->sectors_not_moved,
- __entry->buckets_moved, __entry->buckets_not_moved)
+ __entry->buckets,
+ __entry->sectors_seen,
+ __entry->sectors_moved)
);
TRACE_EVENT(copygc_wait,
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 1a1720116071..e7c3541b38f3 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -670,8 +670,6 @@ static inline int cmp_le32(__le32 l, __le32 r)
#include <linux/uuid.h>
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
static inline bool qstr_eq(const struct qstr l, const struct qstr r)
{
return l.len == r.len && !memcmp(l.name, r.name, l.len);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 92071ca0655f..3dc5a35dd19b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1496,6 +1496,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (!p->skip_locking) {
btrfs_unlock_up_safe(p, parent_level + 1);
+ btrfs_maybe_reset_lockdep_class(root, tmp);
tmp_locked = true;
btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
@@ -1539,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (!p->skip_locking) {
ASSERT(ret == -EAGAIN);
+ btrfs_maybe_reset_lockdep_class(root, tmp);
tmp_locked = true;
btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a2cac9d0a1a9..b2fae67f8fa3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -523,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
u64 end;
u32 len;
- /* For now only order 0 folios are supported for data. */
- ASSERT(folio_order(folio) == 0);
btrfs_debug(fs_info,
"%s: bi_sector=%llu, err=%d, mirror=%u",
__func__, bio->bi_iter.bi_sector, bio->bi_status,
@@ -552,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> folio_shift(folio);
/*
* Zero out the remaining part if this range straddles
@@ -561,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* Here we should only zero the range inside the folio,
* not touch anything else.
*
- * NOTE: i_size is exclusive while end is inclusive.
+ * NOTE: i_size is exclusive while end is inclusive and
+ * folio_contains() takes PAGE_SIZE units.
*/
- if (folio_index(folio) == end_index && i_size <= end) {
+ if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
+ i_size <= end) {
u32 zero_start = max(offset_in_folio(folio, i_size),
offset_in_folio(folio, start));
u32 zero_len = offset_in_folio(folio, end) + 1 -
@@ -899,7 +898,6 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode,
u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
- struct extent_state *cached_state = NULL;
ASSERT(em_cached);
@@ -915,14 +913,12 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode,
*em_cached = NULL;
}
- btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state);
em = btrfs_get_extent(inode, folio, start, len);
if (!IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
}
- unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state);
return em;
}
@@ -956,7 +952,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
return ret;
}
- if (folio->index == last_byte >> folio_shift(folio)) {
+ if (folio_contains(folio, last_byte >> PAGE_SHIFT)) {
size_t zero_offset = offset_in_folio(folio, last_byte);
if (zero_offset) {
@@ -1079,11 +1075,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
int btrfs_read_folio(struct file *file, struct folio *folio)
{
+ struct btrfs_inode *inode = folio_to_inode(folio);
+ const u64 start = folio_pos(folio);
+ const u64 end = start + folio_size(folio) - 1;
+ struct extent_state *cached_state = NULL;
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
struct extent_map *em_cached = NULL;
int ret;
+ btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
+
free_extent_map(em_cached);
/*
@@ -2380,12 +2383,20 @@ void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
struct folio *folio;
+ struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
+ const u64 start = readahead_pos(rac);
+ const u64 end = start + readahead_length(rac) - 1;
+ struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
+ btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
+
while ((folio = readahead_folio(rac)) != NULL)
btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
+
if (em_cached)
free_extent_map(em_cached);
submit_one_bio(&bio_ctrl);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 36f51c311bb1..ed3c0d6546c5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1039,7 +1039,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
loff_t pos = iocb->ki_pos;
int ret;
loff_t oldsize;
- loff_t start_pos;
/*
* Quickly bail out on NOWAIT writes if we don't have the nodatacow or
@@ -1066,9 +1065,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
inode_inc_iversion(inode);
}
- start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
- if (start_pos > oldsize) {
+ if (pos > oldsize) {
/* Expand hole size to cover write data, preventing empty gap */
loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 30eceaf829a7..4aca7475fd82 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1229,6 +1229,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
*/
if (WARN_ON_ONCE(len >= ordered->num_bytes))
return ERR_PTR(-EINVAL);
+ /*
+ * If our ordered extent had an error, there's no point in continuing.
+ * The error may have come from a transaction abort done either by this
+ * task or some other concurrent task, and the transaction abort path
+ * iterates over all existing ordered extents and sets the flag
+ * BTRFS_ORDERED_IOERR on them.
+ */
+ if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
+ const int fs_error = BTRFS_FS_ERROR(fs_info);
+
+ return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
+ }
/* We cannot split partially completed ordered extents. */
if (ordered->bytes_left) {
ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b90fabe302e6..f9d3766c809b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1880,11 +1880,7 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
* Commit current transaction to make sure all the rfer/excl numbers
* get updated.
*/
- trans = btrfs_start_transaction(fs_info->quota_root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(fs_info->quota_root);
if (ret < 0)
return ret;
@@ -1897,8 +1893,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
/*
* It's squota and the subvolume still has numbers needed for future
* accounting, in this case we can not delete it. Just skip it.
+ *
+ * Or the qgroup has already been removed by a qgroup rescan. In both
+ * cases it is safe to ignore the error.
*/
- if (ret == -EBUSY)
+ if (ret == -EBUSY || ret == -ENOENT)
ret = 0;
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 15312013f2a3..aca83a98b75a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -274,8 +274,10 @@ loop:
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (TRANS_ABORTED(cur_trans)) {
+ const int abort_error = cur_trans->aborted;
+
spin_unlock(&fs_info->trans_lock);
- return cur_trans->aborted;
+ return abort_error;
}
if (btrfs_blocked_trans_types[cur_trans->state] & type) {
spin_unlock(&fs_info->trans_lock);
diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c
index 1715d5ca2b2d..e341ade47dd8 100644
--- a/fs/cachefiles/error_inject.c
+++ b/fs/cachefiles/error_inject.c
@@ -11,7 +11,7 @@
unsigned int cachefiles_error_injection_state;
static struct ctl_table_header *cachefiles_sysctl;
-static struct ctl_table cachefiles_sysctls[] = {
+static const struct ctl_table cachefiles_sysctls[] = {
{
.procname = "error_injection",
.data = &cachefiles_error_injection_state,
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fdf9dc15eafa..fdd404fc8112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -412,7 +412,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
- char name[100];
+ char name[NAME_MAX];
doutc(fsc->client, "begin\n");
fsc->debugfs_congestion_kb =
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0bf388e07a02..62e99e65250d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1940,29 +1940,19 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
/*
* Check if cached dentry can be trusted.
*/
-static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int ceph_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc;
struct ceph_client *cl = mdsc->fsc->client;
int valid = 0;
- struct dentry *parent;
- struct inode *dir, *inode;
+ struct inode *inode;
- valid = fscrypt_d_revalidate(dentry, flags);
+ valid = fscrypt_d_revalidate(dir, name, dentry, flags);
if (valid <= 0)
return valid;
- if (flags & LOOKUP_RCU) {
- parent = READ_ONCE(dentry->d_parent);
- dir = d_inode_rcu(parent);
- if (!dir)
- return -ECHILD;
- inode = d_inode_rcu(dentry);
- } else {
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- inode = d_inode(dentry);
- }
+ inode = d_inode_rcu(dentry);
doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n",
dentry, dentry, inode, ceph_dentry(dentry)->offset,
@@ -2008,6 +1998,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
req->r_parent = dir;
ihold(dir);
+ req->r_dname = name;
+
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
mask |= CEPH_CAP_XATTR_SHARED;
@@ -2038,9 +2030,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid");
if (!valid)
ceph_dir_clear_complete(dir);
-
- if (!(flags & LOOKUP_RCU))
- dput(parent);
return valid;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 785fe489ef4b..54b3421501e9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2621,6 +2621,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
{
struct inode *dir = req->r_parent;
struct dentry *dentry = req->r_dentry;
+ const struct qstr *name = req->r_dname;
u8 *cryptbuf = NULL;
u32 len = 0;
int ret = 0;
@@ -2641,8 +2642,10 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
if (!fscrypt_has_encryption_key(dir))
goto success;
- if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX,
- &len)) {
+ if (!name)
+ name = &dentry->d_name;
+
+ if (!fscrypt_fname_encrypted_size(dir, name->len, NAME_MAX, &len)) {
WARN_ON_ONCE(1);
return ERR_PTR(-ENAMETOOLONG);
}
@@ -2657,7 +2660,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
if (!cryptbuf)
return ERR_PTR(-ENOMEM);
- ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len);
+ ret = fscrypt_fname_encrypt(dir, name, cryptbuf, len);
if (ret) {
kfree(cryptbuf);
return ERR_PTR(ret);
@@ -2945,12 +2948,12 @@ static struct ceph_mds_request_head_legacy *
find_legacy_request_head(void *p, u64 features)
{
bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
- struct ceph_mds_request_head_old *ohead;
+ struct ceph_mds_request_head *head;
if (legacy)
return (struct ceph_mds_request_head_legacy *)p;
- ohead = (struct ceph_mds_request_head_old *)p;
- return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
+ head = (struct ceph_mds_request_head *)p;
+ return (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid;
}
/*
@@ -3020,7 +3023,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (legacy)
len = sizeof(struct ceph_mds_request_head_legacy);
else if (request_head_version == 1)
- len = sizeof(struct ceph_mds_request_head_old);
+ len = offsetofend(struct ceph_mds_request_head, args);
else if (request_head_version == 2)
len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
else
@@ -3104,11 +3107,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.version = cpu_to_le16(3);
p = msg->front.iov_base + sizeof(*lhead);
} else if (request_head_version == 1) {
- struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
msg->hdr.version = cpu_to_le16(4);
- ohead->version = cpu_to_le16(1);
- p = msg->front.iov_base + sizeof(*ohead);
+ nhead->version = cpu_to_le16(1);
+ p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, args);
} else if (request_head_version == 2) {
struct ceph_mds_request_head *nhead = msg->front.iov_base;
@@ -3265,7 +3268,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
* so we limit to retry at most 256 times.
*/
if (req->r_attempts) {
- old_max_retry = sizeof_field(struct ceph_mds_request_head_old,
+ old_max_retry = sizeof_field(struct ceph_mds_request_head,
num_retry);
old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
if ((old_version && req->r_attempts >= old_max_retry) ||
@@ -5690,18 +5693,18 @@ static int ceph_mds_auth_match(struct ceph_mds_client *mdsc,
*
* All the other cases --> mismatch
*/
+ bool path_matched = true;
char *first = strstr(_tpath, auth->match.path);
- if (first != _tpath) {
- if (free_tpath)
- kfree(_tpath);
- return 0;
+ if (first != _tpath ||
+ (tlen > len && _tpath[len] != '/')) {
+ path_matched = false;
}
- if (tlen > len && _tpath[len] != '/') {
- if (free_tpath)
- kfree(_tpath);
+ if (free_tpath)
+ kfree(_tpath);
+
+ if (!path_matched)
return 0;
- }
}
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 38bb7e0d2d79..7c9fee9e80d4 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -299,6 +299,8 @@ struct ceph_mds_request {
struct inode *r_target_inode; /* resulting inode */
struct inode *r_new_inode; /* new inode (for creates) */
+ const struct qstr *r_dname; /* stable name (for ->d_revalidate) */
+
#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */
#define CEPH_MDS_R_ABORTED (2) /* call was aborted */
#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4e552ba7bd43..a3e2dfeedfbf 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -445,7 +445,8 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
}
/* called when a cache lookup succeeds */
-static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
+static int coda_dentry_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *de, unsigned int flags)
{
struct inode *inode;
struct coda_inode_info *cii;
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 9f2d5743e2c8..0df46f09b6cc 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -14,7 +14,7 @@
static struct ctl_table_header *fs_table_header;
-static struct ctl_table coda_table[] = {
+static const struct ctl_table coda_table[] = {
{
.procname = "timeout",
.data = &coda_timeout,
diff --git a/fs/coredump.c b/fs/coredump.c
index d48edb37bc35..591700e1b2ce 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -995,7 +995,7 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write,
static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT;
static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX;
-static struct ctl_table coredump_sysctls[] = {
+static const struct ctl_table coredump_sysctls[] = {
{
.procname = "core_uses_pid",
.data = &core_uses_pid,
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 0ad52fbe51c9..010f9c0a4c2f 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -574,11 +574,10 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash);
* Validate dentries in encrypted directories to make sure we aren't potentially
* caching stale dentries after a key has been added.
*/
-int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
- struct dentry *dir;
int err;
- int valid;
/*
* Plaintext names are always valid, since fscrypt doesn't support
@@ -591,30 +590,21 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
/*
* No-key name; valid if the directory's key is still unavailable.
*
- * Although fscrypt forbids rename() on no-key names, we still must use
- * dget_parent() here rather than use ->d_parent directly. That's
- * because a corrupted fs image may contain directory hard links, which
- * the VFS handles by moving the directory's dentry tree in the dcache
- * each time ->lookup() finds the directory and it already has a dentry
- * elsewhere. Thus ->d_parent can be changing, and we must safely grab
- * a reference to some ->d_parent to prevent it from being freed.
+ * Note that in RCU mode we have to bail out here, because
+ * fscrypt_get_encryption_info() may block.
*/
if (flags & LOOKUP_RCU)
return -ECHILD;
- dir = dget_parent(dentry);
/*
* Pass allow_unsupported=true, so that files with an unsupported
* encryption policy can be deleted.
*/
- err = fscrypt_get_encryption_info(d_inode(dir), true);
- valid = !fscrypt_has_encryption_key(d_inode(dir));
- dput(dir);
-
+ err = fscrypt_get_encryption_info(dir, true);
if (err < 0)
return err;
- return valid;
+ return !fscrypt_has_encryption_key(dir);
}
EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);
diff --git a/fs/dcache.c b/fs/dcache.c
index 1a01d7a6a7a9..e3634916ffb9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -192,7 +192,7 @@ static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
-static struct ctl_table fs_dcache_sysctls[] = {
+static const struct ctl_table fs_dcache_sysctls[] = {
{
.procname = "dentry-state",
.data = &dentry_stat,
@@ -295,12 +295,16 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
return dentry_string_cmp(cs, ct, tcount);
}
+/*
+ * Long names are allocated separately from the dentry and never modified.
+ * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot()
+ * for the reason why ->count and ->head can't be combined into a union.
+ * dentry_string_cmp() relies upon ->name[] being word-aligned.
+ */
struct external_name {
- union {
- atomic_t count;
- struct rcu_head head;
- } u;
- unsigned char name[];
+ atomic_t count;
+ struct rcu_head head;
+ unsigned char name[] __aligned(sizeof(unsigned long));
};
static inline struct external_name *external_name(struct dentry *dentry)
@@ -324,31 +328,45 @@ static void __d_free_external(struct rcu_head *head)
static inline int dname_external(const struct dentry *dentry)
{
- return dentry->d_name.name != dentry->d_iname;
+ return dentry->d_name.name != dentry->d_shortname.string;
}
void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
- spin_lock(&dentry->d_lock);
- name->name = dentry->d_name;
- if (unlikely(dname_external(dentry))) {
- atomic_inc(&external_name(dentry)->u.count);
+ unsigned seq;
+ const unsigned char *s;
+
+ rcu_read_lock();
+retry:
+ seq = read_seqcount_begin(&dentry->d_seq);
+ s = READ_ONCE(dentry->d_name.name);
+ name->name.hash_len = dentry->d_name.hash_len;
+ name->name.name = name->inline_name.string;
+ if (likely(s == dentry->d_shortname.string)) {
+ name->inline_name = dentry->d_shortname;
} else {
- memcpy(name->inline_name, dentry->d_iname,
- dentry->d_name.len + 1);
- name->name.name = name->inline_name;
+ struct external_name *p;
+ p = container_of(s, struct external_name, name[0]);
+ // get a valid reference
+ if (unlikely(!atomic_inc_not_zero(&p->count)))
+ goto retry;
+ name->name.name = s;
}
- spin_unlock(&dentry->d_lock);
+ if (read_seqcount_retry(&dentry->d_seq, seq)) {
+ release_dentry_name_snapshot(name);
+ goto retry;
+ }
+ rcu_read_unlock();
}
EXPORT_SYMBOL(take_dentry_name_snapshot);
void release_dentry_name_snapshot(struct name_snapshot *name)
{
- if (unlikely(name->name.name != name->inline_name)) {
+ if (unlikely(name->name.name != name->inline_name.string)) {
struct external_name *p;
p = container_of(name->name.name, struct external_name, name[0]);
- if (unlikely(atomic_dec_and_test(&p->u.count)))
- kfree_rcu(p, u.head);
+ if (unlikely(atomic_dec_and_test(&p->count)))
+ kfree_rcu(p, head);
}
}
EXPORT_SYMBOL(release_dentry_name_snapshot);
@@ -386,7 +404,7 @@ static void dentry_free(struct dentry *dentry)
WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
if (unlikely(dname_external(dentry))) {
struct external_name *p = external_name(dentry);
- if (likely(atomic_dec_and_test(&p->u.count))) {
+ if (likely(atomic_dec_and_test(&p->count))) {
call_rcu(&dentry->d_u.d_rcu, __d_free_external);
return;
}
@@ -1654,10 +1672,10 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
* will still always have a NUL at the end, even if we might
* be overwriting an internal NUL character
*/
- dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
+ dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0;
if (unlikely(!name)) {
name = &slash_name;
- dname = dentry->d_iname;
+ dname = dentry->d_shortname.string;
} else if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
struct external_name *p = kmalloc(size + name->len,
@@ -1667,10 +1685,10 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
- atomic_set(&p->u.count, 1);
+ atomic_set(&p->count, 1);
dname = p->name;
} else {
- dname = dentry->d_iname;
+ dname = dentry->d_shortname.string;
}
dentry->d_name.len = name->len;
@@ -1682,7 +1700,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
smp_store_release(&dentry->d_name.name, dname); /* ^^^ */
dentry->d_flags = 0;
- lockref_init(&dentry->d_lockref, 1);
+ lockref_init(&dentry->d_lockref);
seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
dentry->d_inode = NULL;
dentry->d_parent = dentry;
@@ -2728,10 +2746,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
* dentry:internal, target:external. Steal target's
* storage and make target internal.
*/
- memcpy(target->d_iname, dentry->d_name.name,
- dentry->d_name.len + 1);
dentry->d_name.name = target->d_name.name;
- target->d_name.name = target->d_iname;
+ target->d_shortname = dentry->d_shortname;
+ target->d_name.name = target->d_shortname.string;
}
} else {
if (unlikely(dname_external(dentry))) {
@@ -2739,20 +2756,16 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
* dentry:external, target:internal. Give dentry's
* storage to target and make dentry internal
*/
- memcpy(dentry->d_iname, target->d_name.name,
- target->d_name.len + 1);
target->d_name.name = dentry->d_name.name;
- dentry->d_name.name = dentry->d_iname;
+ dentry->d_shortname = target->d_shortname;
+ dentry->d_name.name = dentry->d_shortname.string;
} else {
/*
* Both are internal.
*/
- unsigned int i;
- BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
- for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
- swap(((long *) &dentry->d_iname)[i],
- ((long *) &target->d_iname)[i]);
- }
+ for (int i = 0; i < DNAME_INLINE_WORDS; i++)
+ swap(dentry->d_shortname.words[i],
+ target->d_shortname.words[i]);
}
}
swap(dentry->d_name.hash_len, target->d_name.hash_len);
@@ -2764,16 +2777,15 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
if (unlikely(dname_external(dentry)))
old_name = external_name(dentry);
if (unlikely(dname_external(target))) {
- atomic_inc(&external_name(target)->u.count);
+ atomic_inc(&external_name(target)->count);
dentry->d_name = target->d_name;
} else {
- memcpy(dentry->d_iname, target->d_name.name,
- target->d_name.len + 1);
- dentry->d_name.name = dentry->d_iname;
+ dentry->d_shortname = target->d_shortname;
+ dentry->d_name.name = dentry->d_shortname.string;
dentry->d_name.hash_len = target->d_name.hash_len;
}
- if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
- kfree_rcu(old_name, u.head);
+ if (old_name && likely(atomic_dec_and_test(&old_name->count)))
+ kfree_rcu(old_name, head);
}
/*
@@ -2954,7 +2966,12 @@ static int __d_unalias(struct dentry *dentry, struct dentry *alias)
goto out_err;
m2 = &alias->d_parent->d_inode->i_rwsem;
out_unalias:
+ if (alias->d_op && alias->d_op->d_unalias_trylock &&
+ !alias->d_op->d_unalias_trylock(alias))
+ goto out_err;
__d_move(alias, dentry, false);
+ if (alias->d_op && alias->d_op->d_unalias_unlock)
+ alias->d_op->d_unalias_unlock(alias);
ret = 0;
out_err:
if (m2)
@@ -3102,12 +3119,12 @@ void d_mark_tmpfile(struct file *file, struct inode *inode)
{
struct dentry *dentry = file->f_path.dentry;
- BUG_ON(dentry->d_name.name != dentry->d_iname ||
+ BUG_ON(dname_external(dentry) ||
!hlist_unhashed(&dentry->d_u.d_alias) ||
!d_unlinked(dentry));
spin_lock(&dentry->d_parent->d_lock);
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
+ dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu",
(unsigned long long)inode->i_ino);
spin_unlock(&dentry->d_lock);
spin_unlock(&dentry->d_parent->d_lock);
@@ -3195,7 +3212,7 @@ static void __init dcache_init(void)
*/
dentry_cache = KMEM_CACHE_USERCOPY(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
- d_iname);
+ d_shortname.string);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index e33cc77699cd..69e9ddcb113d 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -94,6 +94,7 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode)
fsd = d_fsd;
} else {
struct inode *inode = dentry->d_inode;
+ unsigned int methods = 0;
if (WARN_ON(mode == DBGFS_GET_ALREADY))
return -EINVAL;
@@ -106,25 +107,28 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode)
const struct debugfs_short_fops *ops;
ops = fsd->short_fops = DEBUGFS_I(inode)->short_fops;
if (ops->llseek)
- fsd->methods |= HAS_LSEEK;
+ methods |= HAS_LSEEK;
if (ops->read)
- fsd->methods |= HAS_READ;
+ methods |= HAS_READ;
if (ops->write)
- fsd->methods |= HAS_WRITE;
+ methods |= HAS_WRITE;
+ fsd->real_fops = NULL;
} else {
const struct file_operations *ops;
ops = fsd->real_fops = DEBUGFS_I(inode)->real_fops;
if (ops->llseek)
- fsd->methods |= HAS_LSEEK;
+ methods |= HAS_LSEEK;
if (ops->read)
- fsd->methods |= HAS_READ;
+ methods |= HAS_READ;
if (ops->write)
- fsd->methods |= HAS_WRITE;
+ methods |= HAS_WRITE;
if (ops->unlocked_ioctl)
- fsd->methods |= HAS_IOCTL;
+ methods |= HAS_IOCTL;
if (ops->poll)
- fsd->methods |= HAS_POLL;
+ methods |= HAS_POLL;
+ fsd->short_fops = NULL;
}
+ fsd->methods = methods;
refcount_set(&fsd->active_users, 1);
init_completion(&fsd->active_users_drained);
INIT_LIST_HEAD(&fsd->cancellations);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index b20e565b9c5e..1096ff8562fa 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -45,7 +45,7 @@ static int pty_limit_min;
static int pty_limit_max = INT_MAX;
static atomic_t pty_count = ATOMIC_INIT(0);
-static struct ctl_table pty_table[] = {
+static const struct ctl_table pty_table[] = {
{
.procname = "max",
.maxlen = sizeof(int),
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index acaa0825e9bb..1dfd5b81d831 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -17,7 +17,9 @@
/**
* ecryptfs_d_revalidate - revalidate an ecryptfs dentry
- * @dentry: The ecryptfs dentry
+ * @dir: inode of expected parent
+ * @name: expected name
+ * @dentry: dentry to revalidate
* @flags: lookup flags
*
* Called when the VFS needs to revalidate a dentry. This
@@ -28,7 +30,8 @@
* Returns 1 if valid, 0 otherwise.
*
*/
-static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
int rc = 1;
@@ -36,8 +39,15 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE)
- rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+ if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE) {
+ struct inode *lower_dir = ecryptfs_inode_to_lower(dir);
+ struct name_snapshot n;
+
+ take_dentry_name_snapshot(&n, lower_dentry);
+ rc = lower_dentry->d_op->d_revalidate(lower_dir, &n.name,
+ lower_dentry, flags);
+ release_dentry_name_snapshot(&n);
+ }
if (d_really_is_positive(dentry)) {
struct inode *inode = d_inode(dentry);
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 7940241d9355..df2777e05661 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -407,7 +407,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
}
it.index = index;
- it.name = (struct qstr)QSTR_INIT(name, strlen(name));
+ it.name = QSTR(name);
if (it.name.len > EROFS_NAME_LEN)
return -ERANGE;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 29f8963bb523..d771e06db738 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -726,7 +726,7 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- lockref_init(&pcl->lockref, 1); /* one ref for this request */
+ lockref_init(&pcl->lockref); /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f9898e60dd8b..7c0980db77b3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -318,7 +318,7 @@ static void unlist_file(struct epitems_head *head)
static long long_zero;
static long long_max = LONG_MAX;
-static struct ctl_table epoll_table[] = {
+static const struct ctl_table epoll_table[] = {
{
.procname = "max_user_watches",
.data = &max_user_watches,
diff --git a/fs/exec.c b/fs/exec.c
index a49839174472..506cd411f4ac 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2159,7 +2159,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ
return error;
}
-static struct ctl_table fs_exec_sysctls[] = {
+static const struct ctl_table fs_exec_sysctls[] = {
{
.procname = "suid_dumpable",
.data = &suid_dumpable,
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 099f80645072..691dd77b6ab5 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -31,10 +31,9 @@ static inline void exfat_d_version_set(struct dentry *dentry,
* If it happened, the negative dentry isn't actually negative anymore. So,
* drop it.
*/
-static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int exfat_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
- int ret;
-
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -58,11 +57,7 @@ static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
return 0;
- spin_lock(&dentry->d_lock);
- ret = inode_eq_iversion(d_inode(dentry->d_parent),
- exfat_d_version(dentry));
- spin_unlock(&dentry->d_lock);
- return ret;
+ return inode_eq_iversion(dir, exfat_d_version(dentry));
}
/* returns the length of a struct qstr, ignoring trailing dots if necessary */
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 26c4fc37edcf..da4263a14a20 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -322,9 +322,7 @@ restart:
WARN_ON(!list_empty(&ei->i_fc_dilist));
spin_unlock(&sbi->s_fc_lock);
- if (fc_dentry->fcd_name.name &&
- fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
- kfree(fc_dentry->fcd_name.name);
+ release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
return;
@@ -449,22 +447,7 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
node->fcd_op = dentry_update->op;
node->fcd_parent = dir->i_ino;
node->fcd_ino = inode->i_ino;
- if (dentry->d_name.len > DNAME_INLINE_LEN) {
- node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
- if (!node->fcd_name.name) {
- kmem_cache_free(ext4_fc_dentry_cachep, node);
- ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
- mutex_lock(&ei->i_fc_lock);
- return -ENOMEM;
- }
- memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
- dentry->d_name.len);
- } else {
- memcpy(node->fcd_iname, dentry->d_name.name,
- dentry->d_name.len);
- node->fcd_name.name = node->fcd_iname;
- }
- node->fcd_name.len = dentry->d_name.len;
+ take_dentry_name_snapshot(&node->fcd_name, dentry);
INIT_LIST_HEAD(&node->fcd_dilist);
spin_lock(&sbi->s_fc_lock);
if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
@@ -832,7 +815,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
{
struct ext4_fc_dentry_info fcd;
struct ext4_fc_tl tl;
- int dlen = fc_dentry->fcd_name.len;
+ int dlen = fc_dentry->fcd_name.name.len;
u8 *dst = ext4_fc_reserve_space(sb,
EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
@@ -847,7 +830,7 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
dst += EXT4_FC_TAG_BASE_LEN;
memcpy(dst, &fcd, sizeof(fcd));
dst += sizeof(fcd);
- memcpy(dst, fc_dentry->fcd_name.name, dlen);
+ memcpy(dst, fc_dentry->fcd_name.name.name, dlen);
return true;
}
@@ -1328,9 +1311,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
list_del_init(&fc_dentry->fcd_dilist);
spin_unlock(&sbi->s_fc_lock);
- if (fc_dentry->fcd_name.name &&
- fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
- kfree(fc_dentry->fcd_name.name);
+ release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
spin_lock(&sbi->s_fc_lock);
}
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 2fadb2c4780c..3bd534e4dbbf 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -109,8 +109,7 @@ struct ext4_fc_dentry_update {
int fcd_op; /* Type of update create / unlink / link */
int fcd_parent; /* Parent inode number */
int fcd_ino; /* Inode number */
- struct qstr fcd_name; /* Dirent name */
- unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */
+ struct name_snapshot fcd_name; /* Dirent name */
struct list_head fcd_list;
struct list_head fcd_dilist;
};
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 15bf32c21ac0..926c26e90ef8 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,17 +43,13 @@ static inline void vfat_d_version_set(struct dentry *dentry,
* If it happened, the negative dentry isn't actually negative
* anymore. So, drop it.
*/
-static int vfat_revalidate_shortname(struct dentry *dentry)
+static bool vfat_revalidate_shortname(struct dentry *dentry, struct inode *dir)
{
- int ret = 1;
- spin_lock(&dentry->d_lock);
- if (!inode_eq_iversion(d_inode(dentry->d_parent), vfat_d_version(dentry)))
- ret = 0;
- spin_unlock(&dentry->d_lock);
- return ret;
+ return inode_eq_iversion(dir, vfat_d_version(dentry));
}
-static int vfat_revalidate(struct dentry *dentry, unsigned int flags)
+static int vfat_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -61,10 +57,11 @@ static int vfat_revalidate(struct dentry *dentry, unsigned int flags)
/* This is not negative dentry. Always valid. */
if (d_really_is_positive(dentry))
return 1;
- return vfat_revalidate_shortname(dentry);
+ return vfat_revalidate_shortname(dentry, dir);
}
-static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags)
+static int vfat_revalidate_ci(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -97,7 +94,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags)
if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
return 0;
- return vfat_revalidate_shortname(dentry);
+ return vfat_revalidate_shortname(dentry, dir);
}
/* returns the length of a struct qstr, ignoring trailing dots */
diff --git a/fs/file_table.c b/fs/file_table.c
index a32171d2b83f..5c00dc38558d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -106,7 +106,7 @@ static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
-static struct ctl_table fs_stat_sysctls[] = {
+static const struct ctl_table fs_stat_sysctls[] = {
{
.procname = "file-nr",
.data = &files_stat,
@@ -194,6 +194,11 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* refcount bumps we should reinitialize the reused file first.
*/
file_ref_init(&f->f_ref, 1);
+ /*
+ * Disable permission and pre-content events for all files by default.
+ * They may be enabled later by file_set_fsnotify_mode_from_watchers().
+ */
+ file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);
return 0;
}
@@ -351,9 +356,7 @@ static struct file *alloc_file(const struct path *path, int flags,
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
struct vfsmount *mnt, struct path *path)
{
- struct qstr this = QSTR_INIT(name, strlen(name));
-
- path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
+ path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name));
if (!path->dentry)
return -ENOMEM;
path->mnt = mntget(mnt);
@@ -377,7 +380,13 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
if (IS_ERR(file)) {
ihold(inode);
path_put(&path);
+ return file;
}
+ /*
+ * Disable all fsnotify events for pseudo files by default.
+ * They may be enabled by the caller with file_set_fsnotify_mode().
+ */
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY);
return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);
@@ -402,6 +411,11 @@ struct file *alloc_file_pseudo_noaccount(struct inode *inode,
return file;
}
file_init_path(file, &path, fops);
+ /*
+ * Disable all fsnotify events for pseudo files by default.
+ * They may be enabled by the caller with file_set_fsnotify_mode().
+ */
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY);
return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 8674dbfbe59d..ca215a3cba3e 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -63,3 +63,15 @@ config FUSE_PASSTHROUGH
to be performed directly on a backing file.
If you want to allow passthrough operations, answer Y.
+
+config FUSE_IO_URING
+ bool "FUSE communication over io-uring"
+ default y
+ depends on FUSE_FS
+ depends on IO_URING
+ help
+ This allows sending FUSE requests over the io-uring interface and
+ also adds request core affinity.
+
+ If you want to allow fuse server/client communication through io-uring,
+ answer Y.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 2c372180d631..3f0f312a31c1 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -15,5 +15,6 @@ fuse-y += iomode.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
fuse-$(CONFIG_SYSCTL) += sysctl.o
+fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 9abbc2f2894f..0b6ee6dd1fd6 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -240,11 +240,12 @@ static int fuse_send_removemapping(struct inode *inode,
args.opcode = FUSE_REMOVEMAPPING;
args.nodeid = fi->nodeid;
- args.in_numargs = 2;
- args.in_args[0].size = sizeof(*inargp);
- args.in_args[0].value = inargp;
- args.in_args[1].size = inargp->count * sizeof(*remove_one);
- args.in_args[1].value = remove_one;
+ args.in_numargs = 3;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = sizeof(*inargp);
+ args.in_args[1].value = inargp;
+ args.in_args[2].size = inargp->count * sizeof(*remove_one);
+ args.in_args[2].value = remove_one;
return fuse_simple_request(fm, &args);
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 27ccae63495d..5b5f789b37eb 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -6,7 +6,9 @@
See the file COPYING.
*/
+#include "dev_uring_i.h"
#include "fuse_i.h"
+#include "fuse_dev_i.h"
#include <linux/init.h>
#include <linux/module.h>
@@ -28,23 +30,8 @@
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
-/* Ordinary requests have even IDs, while interrupts IDs are odd */
-#define FUSE_INT_REQ_BIT (1ULL << 0)
-#define FUSE_REQ_ID_STEP (1ULL << 1)
-
static struct kmem_cache *fuse_req_cachep;
-static void end_requests(struct list_head *head);
-
-static struct fuse_dev *fuse_get_dev(struct file *file)
-{
- /*
- * Lockless access is OK, because file->private data is set
- * once during mount and is valid until the file is released.
- */
- return READ_ONCE(file->private_data);
-}
-
static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
{
INIT_LIST_HEAD(&req->list);
@@ -89,7 +76,8 @@ void fuse_set_initialized(struct fuse_conn *fc)
static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
{
- return !fc->initialized || (for_background && fc->blocked);
+ return !fc->initialized || (for_background && fc->blocked) ||
+ (fc->io_uring && !fuse_uring_ready(fc));
}
static void fuse_drop_waiting(struct fuse_conn *fc)
@@ -234,7 +222,7 @@ u64 fuse_get_unique(struct fuse_iqueue *fiq)
}
EXPORT_SYMBOL_GPL(fuse_get_unique);
-static unsigned int fuse_req_hash(u64 unique)
+unsigned int fuse_req_hash(u64 unique)
{
return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
}
@@ -250,7 +238,8 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
}
-static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget)
+void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
+ struct fuse_forget_link *forget)
{
spin_lock(&fiq->lock);
if (fiq->connected) {
@@ -263,7 +252,7 @@ static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_li
}
}
-static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
spin_lock(&fiq->lock);
if (list_empty(&req->intr_entry)) {
@@ -580,7 +569,25 @@ ssize_t __fuse_simple_request(struct mnt_idmap *idmap,
return ret;
}
-static bool fuse_request_queue_background(struct fuse_req *req)
+#ifdef CONFIG_FUSE_IO_URING
+static bool fuse_request_queue_background_uring(struct fuse_conn *fc,
+ struct fuse_req *req)
+{
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ req->in.h.unique = fuse_get_unique(fiq);
+ req->in.h.len = sizeof(struct fuse_in_header) +
+ fuse_len_args(req->args->in_numargs,
+ (struct fuse_arg *) req->args->in_args);
+
+ return fuse_uring_queue_bq_req(req);
+}
+#endif
+
+/*
+ * @return true if queued
+ */
+static int fuse_request_queue_background(struct fuse_req *req)
{
struct fuse_mount *fm = req->fm;
struct fuse_conn *fc = fm->fc;
@@ -592,6 +599,12 @@ static bool fuse_request_queue_background(struct fuse_req *req)
atomic_inc(&fc->num_waiting);
}
__set_bit(FR_ISREPLY, &req->flags);
+
+#ifdef CONFIG_FUSE_IO_URING
+ if (fuse_uring_ready(fc))
+ return fuse_request_queue_background_uring(fc, req);
+#endif
+
spin_lock(&fc->bg_lock);
if (likely(fc->connected)) {
fc->num_background++;
@@ -692,22 +705,8 @@ static int unlock_request(struct fuse_req *req)
return err;
}
-struct fuse_copy_state {
- int write;
- struct fuse_req *req;
- struct iov_iter *iter;
- struct pipe_buffer *pipebufs;
- struct pipe_buffer *currbuf;
- struct pipe_inode_info *pipe;
- unsigned long nr_segs;
- struct page *pg;
- unsigned len;
- unsigned offset;
- unsigned move_pages:1;
-};
-
-static void fuse_copy_init(struct fuse_copy_state *cs, int write,
- struct iov_iter *iter)
+void fuse_copy_init(struct fuse_copy_state *cs, int write,
+ struct iov_iter *iter)
{
memset(cs, 0, sizeof(*cs));
cs->write = write;
@@ -814,6 +813,9 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
*size -= ncpy;
cs->len -= ncpy;
cs->offset += ncpy;
+ if (cs->is_uring)
+ cs->ring.copied_sz += ncpy;
+
return ncpy;
}
@@ -1068,9 +1070,9 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
}
/* Copy request arguments to/from userspace buffer */
-static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
- unsigned argpages, struct fuse_arg *args,
- int zeroing)
+int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
+ unsigned argpages, struct fuse_arg *args,
+ int zeroing)
{
int err = 0;
unsigned i;
@@ -1760,7 +1762,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
args = &ap->args;
args->nodeid = outarg->nodeid;
args->opcode = FUSE_NOTIFY_REPLY;
- args->in_numargs = 2;
+ args->in_numargs = 3;
args->in_pages = true;
args->end = fuse_retrieve_end;
@@ -1788,9 +1790,10 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
}
ra->inarg.offset = outarg->offset;
ra->inarg.size = total_len;
- args->in_args[0].size = sizeof(ra->inarg);
- args->in_args[0].value = &ra->inarg;
- args->in_args[1].size = total_len;
+ fuse_set_zero_arg0(args);
+ args->in_args[1].size = sizeof(ra->inarg);
+ args->in_args[1].value = &ra->inarg;
+ args->in_args[2].size = total_len;
err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
if (err)
@@ -1885,7 +1888,7 @@ static void fuse_resend(struct fuse_conn *fc)
spin_unlock(&fiq->lock);
list_for_each_entry(req, &to_queue, list)
clear_bit(FR_PENDING, &req->flags);
- end_requests(&to_queue);
+ fuse_dev_end_requests(&to_queue);
return;
}
/* iq and pq requests are both oldest to newest */
@@ -1934,7 +1937,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
}
/* Look up request on processing list by unique ID */
-static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
+struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique)
{
unsigned int hash = fuse_req_hash(unique);
struct fuse_req *req;
@@ -1946,10 +1949,17 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
return NULL;
}
-static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
- unsigned nbytes)
+int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
+ unsigned nbytes)
{
- unsigned reqsize = sizeof(struct fuse_out_header);
+
+ unsigned int reqsize = 0;
+
+ /*
+ * Uring has all headers separated from args - args is payload only
+ */
+ if (!cs->is_uring)
+ reqsize = sizeof(struct fuse_out_header);
reqsize += fuse_len_args(args->out_numargs, args->out_args);
@@ -2011,7 +2021,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
spin_lock(&fpq->lock);
req = NULL;
if (fpq->connected)
- req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
+ req = fuse_request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);
err = -ENOENT;
if (!req) {
@@ -2049,7 +2059,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
if (oh.error)
err = nbytes != sizeof(oh) ? -EINVAL : 0;
else
- err = copy_out_args(cs, req->args, nbytes);
+ err = fuse_copy_out_args(cs, req->args, nbytes);
fuse_copy_finish(cs);
spin_lock(&fpq->lock);
@@ -2204,7 +2214,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
}
/* Abort all requests on the given list (pending or processing) */
-static void end_requests(struct list_head *head)
+void fuse_dev_end_requests(struct list_head *head)
{
while (!list_empty(head)) {
struct fuse_req *req;
@@ -2307,7 +2317,13 @@ void fuse_abort_conn(struct fuse_conn *fc)
wake_up_all(&fc->blocked_waitq);
spin_unlock(&fc->lock);
- end_requests(&to_end);
+ fuse_dev_end_requests(&to_end);
+
+ /*
+ * fc->lock must not be taken to avoid conflicts with io-uring
+ * locks
+ */
+ fuse_uring_abort(fc);
} else {
spin_unlock(&fc->lock);
}
@@ -2319,6 +2335,8 @@ void fuse_wait_aborted(struct fuse_conn *fc)
/* matches implicit memory barrier in fuse_drop_waiting() */
smp_mb();
wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0);
+
+ fuse_uring_wait_stopped_queues(fc);
}
int fuse_dev_release(struct inode *inode, struct file *file)
@@ -2337,7 +2355,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
list_splice_init(&fpq->processing[i], &to_end);
spin_unlock(&fpq->lock);
- end_requests(&to_end);
+ fuse_dev_end_requests(&to_end);
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
@@ -2475,6 +2493,9 @@ const struct file_operations fuse_dev_operations = {
.fasync = fuse_dev_fasync,
.unlocked_ioctl = fuse_dev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
+#ifdef CONFIG_FUSE_IO_URING
+ .uring_cmd = fuse_uring_cmd,
+#endif
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
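The now-shared FUSE_INT_REQ_BIT/FUSE_REQ_ID_STEP constants and fuse_req_hash() (moved into fuse_dev_i.h below) encode the long-standing ID convention that the io-uring path reuses for its commit_id lookup. A small illustrative sketch, not code from this diff:

/* Illustrative only: an interrupt ID is its ordinary ID with
 * FUSE_INT_REQ_BIT set, and fuse_req_hash() masks that bit off,
 * so both land in the same processing-queue bucket. */
static bool example_same_pq_bucket(u64 unique)
{
	u64 intr = unique | FUSE_INT_REQ_BIT;

	return fuse_req_hash(intr) == fuse_req_hash(unique);
}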
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
new file mode 100644
index 000000000000..ebd2931b4f2a
--- /dev/null
+++ b/fs/fuse/dev_uring.c
@@ -0,0 +1,1319 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (c) 2023-2024 DataDirect Networks.
+ */
+
+#include "fuse_i.h"
+#include "dev_uring_i.h"
+#include "fuse_dev_i.h"
+
+#include <linux/fs.h>
+#include <linux/io_uring/cmd.h>
+
+static bool __read_mostly enable_uring;
+module_param(enable_uring, bool, 0644);
+MODULE_PARM_DESC(enable_uring,
+ "Enable userspace communication through io-uring");
+
+#define FUSE_URING_IOV_SEGS 2 /* header and payload */
+
+
+bool fuse_uring_enabled(void)
+{
+ return enable_uring;
+}
+
+struct fuse_uring_pdu {
+ struct fuse_ring_ent *ent;
+};
+
+static const struct fuse_iqueue_ops fuse_io_uring_ops;
+
+static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
+ struct fuse_ring_ent *ring_ent)
+{
+ struct fuse_uring_pdu *pdu =
+ io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);
+
+ pdu->ent = ring_ent;
+}
+
+static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd)
+{
+ struct fuse_uring_pdu *pdu =
+ io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu);
+
+ return pdu->ent;
+}
+
+static void fuse_uring_flush_bg(struct fuse_ring_queue *queue)
+{
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_conn *fc = ring->fc;
+
+ lockdep_assert_held(&queue->lock);
+ lockdep_assert_held(&fc->bg_lock);
+
+ /*
+ * Allow one bg request per queue, ignoring global fc limits.
+ * This prevents a single queue from consuming all resources and
+ * eliminates the need for remote queue wake-ups when global
+ * limits are met but this queue has no more waiting requests.
+ */
+ while ((fc->active_background < fc->max_background ||
+ !queue->active_background) &&
+ (!list_empty(&queue->fuse_req_bg_queue))) {
+ struct fuse_req *req;
+
+ req = list_first_entry(&queue->fuse_req_bg_queue,
+ struct fuse_req, list);
+ fc->active_background++;
+ queue->active_background++;
+
+ list_move_tail(&req->list, &queue->fuse_req_queue);
+ }
+}
+
+static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req,
+ int error)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_conn *fc = ring->fc;
+
+ lockdep_assert_not_held(&queue->lock);
+ spin_lock(&queue->lock);
+ ent->fuse_req = NULL;
+ if (test_bit(FR_BACKGROUND, &req->flags)) {
+ queue->active_background--;
+ spin_lock(&fc->bg_lock);
+ fuse_uring_flush_bg(queue);
+ spin_unlock(&fc->bg_lock);
+ }
+
+ spin_unlock(&queue->lock);
+
+ if (error)
+ req->out.h.error = error;
+
+ clear_bit(FR_SENT, &req->flags);
+ fuse_request_end(req);
+}
+
+/* Abort all queued requests on the given ring queue */
+static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue)
+{
+ struct fuse_req *req;
+ LIST_HEAD(req_list);
+
+ spin_lock(&queue->lock);
+ list_for_each_entry(req, &queue->fuse_req_queue, list)
+ clear_bit(FR_PENDING, &req->flags);
+ list_splice_init(&queue->fuse_req_queue, &req_list);
+ spin_unlock(&queue->lock);
+
+ /* must not hold queue lock to avoid order issues with fi->lock */
+ fuse_dev_end_requests(&req_list);
+}
+
+void fuse_uring_abort_end_requests(struct fuse_ring *ring)
+{
+ int qid;
+ struct fuse_ring_queue *queue;
+ struct fuse_conn *fc = ring->fc;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ queue = READ_ONCE(ring->queues[qid]);
+ if (!queue)
+ continue;
+
+ queue->stopped = true;
+
+ WARN_ON_ONCE(ring->fc->max_background != UINT_MAX);
+ spin_lock(&queue->lock);
+ spin_lock(&fc->bg_lock);
+ fuse_uring_flush_bg(queue);
+ spin_unlock(&fc->bg_lock);
+ spin_unlock(&queue->lock);
+ fuse_uring_abort_end_queue_requests(queue);
+ }
+}
+
+void fuse_uring_destruct(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+ int qid;
+
+ if (!ring)
+ return;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = ring->queues[qid];
+ struct fuse_ring_ent *ent, *next;
+
+ if (!queue)
+ continue;
+
+ WARN_ON(!list_empty(&queue->ent_avail_queue));
+ WARN_ON(!list_empty(&queue->ent_w_req_queue));
+ WARN_ON(!list_empty(&queue->ent_commit_queue));
+ WARN_ON(!list_empty(&queue->ent_in_userspace));
+
+ list_for_each_entry_safe(ent, next, &queue->ent_released,
+ list) {
+ list_del_init(&ent->list);
+ kfree(ent);
+ }
+
+ kfree(queue->fpq.processing);
+ kfree(queue);
+ ring->queues[qid] = NULL;
+ }
+
+ kfree(ring->queues);
+ kfree(ring);
+ fc->ring = NULL;
+}
+
+/*
+ * Basic ring setup for this connection based on the provided configuration
+ */
+static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring;
+ size_t nr_queues = num_possible_cpus();
+ struct fuse_ring *res = NULL;
+ size_t max_payload_size;
+
+ ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT);
+ if (!ring)
+ return NULL;
+
+ ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *),
+ GFP_KERNEL_ACCOUNT);
+ if (!ring->queues)
+ goto out_err;
+
+ max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write);
+ max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE);
+
+ spin_lock(&fc->lock);
+ if (fc->ring) {
+ /* race, another thread created the ring in the meantime */
+ spin_unlock(&fc->lock);
+ res = fc->ring;
+ goto out_err;
+ }
+
+ init_waitqueue_head(&ring->stop_waitq);
+
+ fc->ring = ring;
+ ring->nr_queues = nr_queues;
+ ring->fc = fc;
+ ring->max_payload_sz = max_payload_size;
+ atomic_set(&ring->queue_refs, 0);
+
+ spin_unlock(&fc->lock);
+ return ring;
+
+out_err:
+ kfree(ring->queues);
+ kfree(ring);
+ return res;
+}
+
+static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring,
+ int qid)
+{
+ struct fuse_conn *fc = ring->fc;
+ struct fuse_ring_queue *queue;
+ struct list_head *pq;
+
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT);
+ if (!queue)
+ return NULL;
+ pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL);
+ if (!pq) {
+ kfree(queue);
+ return NULL;
+ }
+
+ queue->qid = qid;
+ queue->ring = ring;
+ spin_lock_init(&queue->lock);
+
+ INIT_LIST_HEAD(&queue->ent_avail_queue);
+ INIT_LIST_HEAD(&queue->ent_commit_queue);
+ INIT_LIST_HEAD(&queue->ent_w_req_queue);
+ INIT_LIST_HEAD(&queue->ent_in_userspace);
+ INIT_LIST_HEAD(&queue->fuse_req_queue);
+ INIT_LIST_HEAD(&queue->fuse_req_bg_queue);
+ INIT_LIST_HEAD(&queue->ent_released);
+
+ queue->fpq.processing = pq;
+ fuse_pqueue_init(&queue->fpq);
+
+ spin_lock(&fc->lock);
+ if (ring->queues[qid]) {
+ spin_unlock(&fc->lock);
+ kfree(queue->fpq.processing);
+ kfree(queue);
+ return ring->queues[qid];
+ }
+
+ /*
+ * WRITE_ONCE() under fc->lock: most readers access ring->queues[qid]
+ * without taking the lock at all.
+ */
+ WRITE_ONCE(ring->queues[qid], queue);
+ spin_unlock(&fc->lock);
+
+ return queue;
+}
+
+static void fuse_uring_stop_fuse_req_end(struct fuse_req *req)
+{
+ clear_bit(FR_SENT, &req->flags);
+ req->out.h.error = -ECONNABORTED;
+ fuse_request_end(req);
+}
+
+/*
+ * Release a request/entry on connection tear down
+ */
+static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent)
+{
+ struct fuse_req *req;
+ struct io_uring_cmd *cmd;
+
+ struct fuse_ring_queue *queue = ent->queue;
+
+ spin_lock(&queue->lock);
+ cmd = ent->cmd;
+ ent->cmd = NULL;
+ req = ent->fuse_req;
+ ent->fuse_req = NULL;
+ if (req) {
+ /* remove entry from queue->fpq->processing */
+ list_del_init(&req->list);
+ }
+
+ /*
+ * The entry must not be freed immediately: entries are accessed through
+ * direct pointers on IO_URING_F_CANCEL, so there is a risk of a race with
+ * daemon termination (which triggers IO_URING_F_CANCEL) accessing an
+ * entry without checking its list state first.
+ */
+ list_move(&ent->list, &queue->ent_released);
+ ent->state = FRRS_RELEASED;
+ spin_unlock(&queue->lock);
+
+ if (cmd)
+ io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED);
+
+ if (req)
+ fuse_uring_stop_fuse_req_end(req);
+}
+
+static void fuse_uring_stop_list_entries(struct list_head *head,
+ struct fuse_ring_queue *queue,
+ enum fuse_ring_req_state exp_state)
+{
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_ring_ent *ent, *next;
+ ssize_t queue_refs = SSIZE_MAX;
+ LIST_HEAD(to_teardown);
+
+ spin_lock(&queue->lock);
+ list_for_each_entry_safe(ent, next, head, list) {
+ if (ent->state != exp_state) {
+ pr_warn("entry teardown qid=%d state=%d expected=%d",
+ queue->qid, ent->state, exp_state);
+ continue;
+ }
+
+ ent->state = FRRS_TEARDOWN;
+ list_move(&ent->list, &to_teardown);
+ }
+ spin_unlock(&queue->lock);
+
+ /* no queue lock to avoid lock order issues */
+ list_for_each_entry_safe(ent, next, &to_teardown, list) {
+ fuse_uring_entry_teardown(ent);
+ queue_refs = atomic_dec_return(&ring->queue_refs);
+ WARN_ON_ONCE(queue_refs < 0);
+ }
+}
+
+static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue)
+{
+ fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue,
+ FRRS_USERSPACE);
+ fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue,
+ FRRS_AVAILABLE);
+}
+
+/*
+ * Log state debug info
+ */
+static void fuse_uring_log_ent_state(struct fuse_ring *ring)
+{
+ int qid;
+ struct fuse_ring_ent *ent;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = ring->queues[qid];
+
+ if (!queue)
+ continue;
+
+ spin_lock(&queue->lock);
+ /*
+ * Log entries from the intermediate queue, the other queues
+ * should be empty
+ */
+ list_for_each_entry(ent, &queue->ent_w_req_queue, list) {
+ pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n",
+ ring, qid, ent, ent->state);
+ }
+ list_for_each_entry(ent, &queue->ent_commit_queue, list) {
+ pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n",
+ ring, qid, ent, ent->state);
+ }
+ spin_unlock(&queue->lock);
+ }
+ ring->stop_debug_log = 1;
+}
+
+static void fuse_uring_async_stop_queues(struct work_struct *work)
+{
+ int qid;
+ struct fuse_ring *ring =
+ container_of(work, struct fuse_ring, async_teardown_work.work);
+
+ /* XXX code dup */
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
+
+ if (!queue)
+ continue;
+
+ fuse_uring_teardown_entries(queue);
+ }
+
+ /*
+ * Some ring entries might still be in the middle of IO operations,
+ * i.e. about to be handled by file_operations::uring_cmd or on the
+ * way to userspace. We could handle that with conditions in the
+ * run-time code, but it is easier/cleaner to have an async teardown
+ * handler that reschedules itself while queue references remain.
+ */
+ if (atomic_read(&ring->queue_refs) > 0) {
+ if (time_after(jiffies,
+ ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT))
+ fuse_uring_log_ent_state(ring);
+
+ schedule_delayed_work(&ring->async_teardown_work,
+ FUSE_URING_TEARDOWN_INTERVAL);
+ } else {
+ wake_up_all(&ring->stop_waitq);
+ }
+}
+
+/*
+ * Stop the ring queues
+ */
+void fuse_uring_stop_queues(struct fuse_ring *ring)
+{
+ int qid;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
+
+ if (!queue)
+ continue;
+
+ fuse_uring_teardown_entries(queue);
+ }
+
+ if (atomic_read(&ring->queue_refs) > 0) {
+ ring->teardown_time = jiffies;
+ INIT_DELAYED_WORK(&ring->async_teardown_work,
+ fuse_uring_async_stop_queues);
+ schedule_delayed_work(&ring->async_teardown_work,
+ FUSE_URING_TEARDOWN_INTERVAL);
+ } else {
+ wake_up_all(&ring->stop_waitq);
+ }
+}
+
+/*
+ * Handle IO_URING_F_CANCEL, typically should come on daemon termination.
+ *
+ * Releasing the last entry should trigger fuse_dev_release() if
+ * the daemon was terminated
+ */
+static void fuse_uring_cancel(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
+ struct fuse_ring_queue *queue;
+ bool need_cmd_done = false;
+
+ /*
+ * direct access on ent - it must not be destructed as long as
+ * IO_URING_F_CANCEL might come up
+ */
+ queue = ent->queue;
+ spin_lock(&queue->lock);
+ if (ent->state == FRRS_AVAILABLE) {
+ ent->state = FRRS_USERSPACE;
+ list_move(&ent->list, &queue->ent_in_userspace);
+ need_cmd_done = true;
+ ent->cmd = NULL;
+ }
+ spin_unlock(&queue->lock);
+
+ if (need_cmd_done) {
+ /* no queue lock to avoid lock order issues */
+ io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
+ }
+}
+
+static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags,
+ struct fuse_ring_ent *ring_ent)
+{
+ uring_cmd_set_ring_ent(cmd, ring_ent);
+ io_uring_cmd_mark_cancelable(cmd, issue_flags);
+}
+
+/*
+ * Checks for errors and stores them in the request
+ */
+static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
+ struct fuse_req *req,
+ struct fuse_conn *fc)
+{
+ int err;
+
+ err = -EINVAL;
+ if (oh->unique == 0) {
+ /* Not supported through io-uring yet */
+ pr_warn_once("notify through fuse-io-uring not supported\n");
+ goto err;
+ }
+
+ if (oh->error <= -ERESTARTSYS || oh->error > 0)
+ goto err;
+
+ if (oh->error) {
+ err = oh->error;
+ goto err;
+ }
+
+ err = -ENOENT;
+ if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) {
+ pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n",
+ req->in.h.unique,
+ oh->unique & ~FUSE_INT_REQ_BIT);
+ goto err;
+ }
+
+ /*
+ * Is it an interrupt reply ID?
+ * XXX: Not supported through fuse-io-uring yet, it should not even
+ * find the request - should not happen.
+ */
+ WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT);
+
+ err = 0;
+err:
+ return err;
+}
+
+static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
+ struct fuse_req *req,
+ struct fuse_ring_ent *ent)
+{
+ struct fuse_copy_state cs;
+ struct fuse_args *args = req->args;
+ struct iov_iter iter;
+ int err;
+ struct fuse_uring_ent_in_out ring_in_out;
+
+ err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out,
+ sizeof(ring_in_out));
+ if (err)
+ return -EFAULT;
+
+ err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz,
+ &iter);
+ if (err)
+ return err;
+
+ fuse_copy_init(&cs, 0, &iter);
+ cs.is_uring = 1;
+ cs.req = req;
+
+ return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
+}
+
+/*
+ * Copy data from the req to the ring buffer
+ */
+static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
+ struct fuse_ring_ent *ent)
+{
+ struct fuse_copy_state cs;
+ struct fuse_args *args = req->args;
+ struct fuse_in_arg *in_args = args->in_args;
+ int num_args = args->in_numargs;
+ int err;
+ struct iov_iter iter;
+ struct fuse_uring_ent_in_out ent_in_out = {
+ .flags = 0,
+ .commit_id = req->in.h.unique,
+ };
+
+ err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter);
+ if (err) {
+ pr_info_ratelimited("fuse: Import of user buffer failed\n");
+ return err;
+ }
+
+ fuse_copy_init(&cs, 1, &iter);
+ cs.is_uring = 1;
+ cs.req = req;
+
+ if (num_args > 0) {
+ /*
+ * The expectation is that the first argument is the per-op header;
+ * some opcodes have a zero-size header.
+ */
+ if (args->in_args[0].size > 0) {
+ err = copy_to_user(&ent->headers->op_in, in_args->value,
+ in_args->size);
+ if (err) {
+ pr_info_ratelimited(
+ "Copying the header failed.\n");
+ return -EFAULT;
+ }
+ }
+ in_args++;
+ num_args--;
+ }
+
+ /* copy the payload */
+ err = fuse_copy_args(&cs, num_args, args->in_pages,
+ (struct fuse_arg *)in_args, 0);
+ if (err) {
+ pr_info_ratelimited("%s fuse_copy_args failed\n", __func__);
+ return err;
+ }
+
+ ent_in_out.payload_sz = cs.ring.copied_sz;
+ err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out,
+ sizeof(ent_in_out));
+ return err ? -EFAULT : 0;
+}
+
+static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_ring *ring = queue->ring;
+ int err;
+
+ err = -EIO;
+ if (WARN_ON(ent->state != FRRS_FUSE_REQ)) {
+ pr_err("qid=%d ring-req=%p invalid state %d on send\n",
+ queue->qid, ent, ent->state);
+ return err;
+ }
+
+ err = -EINVAL;
+ if (WARN_ON(req->in.h.unique == 0))
+ return err;
+
+ /* copy the request */
+ err = fuse_uring_args_to_ring(ring, req, ent);
+ if (unlikely(err)) {
+ pr_info_ratelimited("Copy to ring failed: %d\n", err);
+ return err;
+ }
+
+ /* copy fuse_in_header */
+ err = copy_to_user(&ent->headers->in_out, &req->in.h,
+ sizeof(req->in.h));
+ if (err) {
+ err = -EFAULT;
+ return err;
+ }
+
+ return 0;
+}
+
+static int fuse_uring_prepare_send(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ int err;
+
+ err = fuse_uring_copy_to_ring(ent, req);
+ if (!err)
+ set_bit(FR_SENT, &req->flags);
+ else
+ fuse_uring_req_end(ent, req, err);
+
+ return err;
+}
+
+/*
+ * Write data to the ring buffer and send the request to userspace,
+ * userspace will read it.
+ * This is comparable to a classical read(/dev/fuse).
+ */
+static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
+ struct fuse_req *req,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ int err;
+ struct io_uring_cmd *cmd;
+
+ err = fuse_uring_prepare_send(ent, req);
+ if (err)
+ return err;
+
+ spin_lock(&queue->lock);
+ cmd = ent->cmd;
+ ent->cmd = NULL;
+ ent->state = FRRS_USERSPACE;
+ list_move(&ent->list, &queue->ent_in_userspace);
+ spin_unlock(&queue->lock);
+
+ io_uring_cmd_done(cmd, 0, 0, issue_flags);
+ return 0;
+}
+
+/*
+ * Make a ring entry available for fuse_req assignment
+ */
+static void fuse_uring_ent_avail(struct fuse_ring_ent *ent,
+ struct fuse_ring_queue *queue)
+{
+ WARN_ON_ONCE(!ent->cmd);
+ list_move(&ent->list, &queue->ent_avail_queue);
+ ent->state = FRRS_AVAILABLE;
+}
+
+/* Used to find the request on SQE commit */
+static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_pqueue *fpq = &queue->fpq;
+ unsigned int hash;
+
+ req->ring_entry = ent;
+ hash = fuse_req_hash(req->in.h.unique);
+ list_move_tail(&req->list, &fpq->processing[hash]);
+}
+
+/*
+ * Assign a fuse queue entry to the given entry
+ */
+static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
+ struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_conn *fc = req->fm->fc;
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ lockdep_assert_held(&queue->lock);
+
+ if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE &&
+ ent->state != FRRS_COMMIT)) {
+ pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid,
+ ent->state);
+ }
+
+ spin_lock(&fiq->lock);
+ clear_bit(FR_PENDING, &req->flags);
+ spin_unlock(&fiq->lock);
+ ent->fuse_req = req;
+ ent->state = FRRS_FUSE_REQ;
+ list_move(&ent->list, &queue->ent_w_req_queue);
+ fuse_uring_add_to_pq(ent, req);
+}
+
+/* Fetch the next fuse request if available */
+static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent)
+ __must_hold(&queue->lock)
+{
+ struct fuse_req *req;
+ struct fuse_ring_queue *queue = ent->queue;
+ struct list_head *req_queue = &queue->fuse_req_queue;
+
+ lockdep_assert_held(&queue->lock);
+
+ /* get and assign the next entry while it is still holding the lock */
+ req = list_first_entry_or_null(req_queue, struct fuse_req, list);
+ if (req)
+ fuse_uring_add_req_to_ring_ent(ent, req);
+
+ return req;
+}
+
+/*
+ * Read data from the ring buffer, which user space has written to
+ * This is comparable to the handling of a classical write(/dev/fuse).
+ * Also make the ring request available again for new fuse requests.
+ */
+static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req,
+ unsigned int issue_flags)
+{
+ struct fuse_ring *ring = ent->queue->ring;
+ struct fuse_conn *fc = ring->fc;
+ ssize_t err = 0;
+
+ err = copy_from_user(&req->out.h, &ent->headers->in_out,
+ sizeof(req->out.h));
+ if (err) {
+ req->out.h.error = -EFAULT;
+ goto out;
+ }
+
+ err = fuse_uring_out_header_has_err(&req->out.h, req, fc);
+ if (err) {
+ /* req->out.h.error already set */
+ goto out;
+ }
+
+ err = fuse_uring_copy_from_ring(ring, req, ent);
+out:
+ fuse_uring_req_end(ent, req, err);
+}
+
+/*
+ * Get the next fuse req and send it
+ */
+static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent,
+ struct fuse_ring_queue *queue,
+ unsigned int issue_flags)
+{
+ int err;
+ struct fuse_req *req;
+
+retry:
+ spin_lock(&queue->lock);
+ fuse_uring_ent_avail(ent, queue);
+ req = fuse_uring_ent_assign_req(ent);
+ spin_unlock(&queue->lock);
+
+ if (req) {
+ err = fuse_uring_send_next_to_ring(ent, req, issue_flags);
+ if (err)
+ goto retry;
+ }
+}
+
+static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+
+ lockdep_assert_held(&queue->lock);
+
+ if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE))
+ return -EIO;
+
+ ent->state = FRRS_COMMIT;
+ list_move(&ent->list, &queue->ent_commit_queue);
+
+ return 0;
+}
+
+/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */
+static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags,
+ struct fuse_conn *fc)
+{
+ const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+ struct fuse_ring_ent *ent;
+ int err;
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ uint64_t commit_id = READ_ONCE(cmd_req->commit_id);
+ unsigned int qid = READ_ONCE(cmd_req->qid);
+ struct fuse_pqueue *fpq;
+ struct fuse_req *req;
+
+ err = -ENOTCONN;
+ if (!ring)
+ return err;
+
+ if (qid >= ring->nr_queues)
+ return -EINVAL;
+
+ queue = ring->queues[qid];
+ if (!queue)
+ return err;
+ fpq = &queue->fpq;
+
+ if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped))
+ return err;
+
+ spin_lock(&queue->lock);
+ /* Find a request based on the unique ID of the fuse request
+ * This should get revised, as it needs a hash calculation and list
+ * search. And full struct fuse_pqueue is needed (memory overhead).
+ * As well as the link from req to ring_ent.
+ */
+ req = fuse_request_find(fpq, commit_id);
+ err = -ENOENT;
+ if (!req) {
+ pr_info("qid=%d commit_id %llu not found\n", queue->qid,
+ commit_id);
+ spin_unlock(&queue->lock);
+ return err;
+ }
+ list_del_init(&req->list);
+ ent = req->ring_entry;
+ req->ring_entry = NULL;
+
+ err = fuse_ring_ent_set_commit(ent);
+ if (err != 0) {
+ pr_info_ratelimited("qid=%d commit_id %llu state %d",
+ queue->qid, commit_id, ent->state);
+ spin_unlock(&queue->lock);
+ req->out.h.error = err;
+ clear_bit(FR_SENT, &req->flags);
+ fuse_request_end(req);
+ return err;
+ }
+
+ ent->cmd = cmd;
+ spin_unlock(&queue->lock);
+
+ /* without the queue lock, as other locks are taken */
+ fuse_uring_prepare_cancel(cmd, issue_flags, ent);
+ fuse_uring_commit(ent, req, issue_flags);
+
+ /*
+ * Fetching the next request is absolutely required as queued
+ * fuse requests would otherwise not get processed - committing
+ * and fetching is done in one step vs legacy fuse, which has separated
+ * read (fetch request) and write (commit result).
+ */
+ fuse_uring_next_fuse_req(ent, queue, issue_flags);
+ return 0;
+}
+
+static bool is_ring_ready(struct fuse_ring *ring, int current_qid)
+{
+ int qid;
+ struct fuse_ring_queue *queue;
+ bool ready = true;
+
+ for (qid = 0; qid < ring->nr_queues && ready; qid++) {
+ if (current_qid == qid)
+ continue;
+
+ queue = ring->queues[qid];
+ if (!queue) {
+ ready = false;
+ break;
+ }
+
+ spin_lock(&queue->lock);
+ if (list_empty(&queue->ent_avail_queue))
+ ready = false;
+ spin_unlock(&queue->lock);
+ }
+
+ return ready;
+}
+
+/*
+ * fuse_uring_req_fetch command handling
+ */
+static void fuse_uring_do_register(struct fuse_ring_ent *ent,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_conn *fc = ring->fc;
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ fuse_uring_prepare_cancel(cmd, issue_flags, ent);
+
+ spin_lock(&queue->lock);
+ ent->cmd = cmd;
+ fuse_uring_ent_avail(ent, queue);
+ spin_unlock(&queue->lock);
+
+ if (!ring->ready) {
+ bool ready = is_ring_ready(ring, queue->qid);
+
+ if (ready) {
+ WRITE_ONCE(fiq->ops, &fuse_io_uring_ops);
+ WRITE_ONCE(ring->ready, true);
+ wake_up_all(&fc->blocked_waitq);
+ }
+ }
+}
+
+/*
+ * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1]
+ * the payload
+ */
+static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe,
+ struct iovec iov[FUSE_URING_IOV_SEGS])
+{
+ struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ struct iov_iter iter;
+ ssize_t ret;
+
+ if (sqe->len != FUSE_URING_IOV_SEGS)
+ return -EINVAL;
+
+ /*
+ * Direction for buffer access will actually be READ and WRITE,
+ * using write for the import should include READ access as well.
+ */
+ ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS,
+ FUSE_URING_IOV_SEGS, &iov, &iter);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static struct fuse_ring_ent *
+fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
+ struct fuse_ring_queue *queue)
+{
+ struct fuse_ring *ring = queue->ring;
+ struct fuse_ring_ent *ent;
+ size_t payload_size;
+ struct iovec iov[FUSE_URING_IOV_SEGS];
+ int err;
+
+ err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov);
+ if (err) {
+ pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n",
+ err);
+ return ERR_PTR(err);
+ }
+
+ err = -EINVAL;
+ if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) {
+ pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len);
+ return ERR_PTR(err);
+ }
+
+ payload_size = iov[1].iov_len;
+ if (payload_size < ring->max_payload_sz) {
+ pr_info_ratelimited("Invalid req payload len %zu\n",
+ payload_size);
+ return ERR_PTR(err);
+ }
+
+ err = -ENOMEM;
+ ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT);
+ if (!ent)
+ return ERR_PTR(err);
+
+ INIT_LIST_HEAD(&ent->list);
+
+ ent->queue = queue;
+ ent->headers = iov[0].iov_base;
+ ent->payload = iov[1].iov_base;
+
+ atomic_inc(&ring->queue_refs);
+ return ent;
+}
+
+/*
+ * Register header and payload buffer with the kernel and puts the
+ * entry as "ready to get fuse requests" on the queue
+ */
+static int fuse_uring_register(struct io_uring_cmd *cmd,
+ unsigned int issue_flags, struct fuse_conn *fc)
+{
+ const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ent;
+ int err;
+ unsigned int qid = READ_ONCE(cmd_req->qid);
+
+ err = -ENOMEM;
+ if (!ring) {
+ ring = fuse_uring_create(fc);
+ if (!ring)
+ return err;
+ }
+
+ if (qid >= ring->nr_queues) {
+ pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid);
+ return -EINVAL;
+ }
+
+ queue = ring->queues[qid];
+ if (!queue) {
+ queue = fuse_uring_create_queue(ring, qid);
+ if (!queue)
+ return err;
+ }
+
+ /*
+ * The created queue above does not need to be destructed in
+ * case of entry errors below, will be done at ring destruction time.
+ */
+
+ ent = fuse_uring_create_ring_ent(cmd, queue);
+ if (IS_ERR(ent))
+ return PTR_ERR(ent);
+
+ fuse_uring_do_register(ent, cmd, issue_flags);
+
+ return 0;
+}
+
+/*
+ * Entry function from io_uring to handle the given passthrough command
+ * (op code IORING_OP_URING_CMD)
+ */
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct fuse_dev *fud;
+ struct fuse_conn *fc;
+ u32 cmd_op = cmd->cmd_op;
+ int err;
+
+ if ((unlikely(issue_flags & IO_URING_F_CANCEL))) {
+ fuse_uring_cancel(cmd, issue_flags);
+ return 0;
+ }
+
+ /* This extra SQE size holds struct fuse_uring_cmd_req */
+ if (!(issue_flags & IO_URING_F_SQE128))
+ return -EINVAL;
+
+ fud = fuse_get_dev(cmd->file);
+ if (!fud) {
+ pr_info_ratelimited("No fuse device found\n");
+ return -ENOTCONN;
+ }
+ fc = fud->fc;
+
+ /* Once a connection has io-uring enabled on it, it can't be disabled */
+ if (!enable_uring && !fc->io_uring) {
+ pr_info_ratelimited("fuse-io-uring is disabled\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (fc->aborted)
+ return -ECONNABORTED;
+ if (!fc->connected)
+ return -ENOTCONN;
+
+ /*
+ * fuse_uring_register() needs the ring to be initialized,
+ * we need to know the max payload size
+ */
+ if (!fc->initialized)
+ return -EAGAIN;
+
+ switch (cmd_op) {
+ case FUSE_IO_URING_CMD_REGISTER:
+ err = fuse_uring_register(cmd, issue_flags, fc);
+ if (err) {
+ pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n",
+ err);
+ fc->io_uring = 0;
+ wake_up_all(&fc->blocked_waitq);
+ return err;
+ }
+ break;
+ case FUSE_IO_URING_CMD_COMMIT_AND_FETCH:
+ err = fuse_uring_commit_fetch(cmd, issue_flags, fc);
+ if (err) {
+ pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n",
+ err);
+ return err;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return -EIOCBQUEUED;
+}
+
+static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
+ ssize_t ret, unsigned int issue_flags)
+{
+ struct fuse_ring_queue *queue = ent->queue;
+
+ spin_lock(&queue->lock);
+ ent->state = FRRS_USERSPACE;
+ list_move(&ent->list, &queue->ent_in_userspace);
+ ent->cmd = NULL;
+ spin_unlock(&queue->lock);
+
+ io_uring_cmd_done(cmd, ret, 0, issue_flags);
+}
+
+/*
+ * This prepares and sends the ring request in fuse-uring task context.
+ * User buffers are not mapped yet - the application does not have permission
+ * to write to it - this has to be executed in ring task context.
+ */
+static void fuse_uring_send_in_task(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd);
+ struct fuse_ring_queue *queue = ent->queue;
+ int err;
+
+ if (!(issue_flags & IO_URING_F_TASK_DEAD)) {
+ err = fuse_uring_prepare_send(ent, ent->fuse_req);
+ if (err) {
+ fuse_uring_next_fuse_req(ent, queue, issue_flags);
+ return;
+ }
+ } else {
+ err = -ECANCELED;
+ }
+
+ fuse_uring_send(ent, cmd, err, issue_flags);
+}
+
+static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring)
+{
+ unsigned int qid;
+ struct fuse_ring_queue *queue;
+
+ qid = task_cpu(current);
+
+ if (WARN_ONCE(qid >= ring->nr_queues,
+ "Core number (%u) exceeds nr queues (%zu)\n", qid,
+ ring->nr_queues))
+ qid = 0;
+
+ queue = ring->queues[qid];
+ WARN_ONCE(!queue, "Missing queue for qid %d\n", qid);
+
+ return queue;
+}
+
+static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent)
+{
+ struct io_uring_cmd *cmd = ent->cmd;
+
+ uring_cmd_set_ring_ent(cmd, ent);
+ io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task);
+}
+
+/* queue a fuse request and send it if a ring entry is available */
+void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
+{
+ struct fuse_conn *fc = req->fm->fc;
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ent = NULL;
+ int err;
+
+ err = -EINVAL;
+ queue = fuse_uring_task_to_queue(ring);
+ if (!queue)
+ goto err;
+
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique(fiq);
+
+ spin_lock(&queue->lock);
+ err = -ENOTCONN;
+ if (unlikely(queue->stopped))
+ goto err_unlock;
+
+ ent = list_first_entry_or_null(&queue->ent_avail_queue,
+ struct fuse_ring_ent, list);
+ if (ent)
+ fuse_uring_add_req_to_ring_ent(ent, req);
+ else
+ list_add_tail(&req->list, &queue->fuse_req_queue);
+ spin_unlock(&queue->lock);
+
+ if (ent)
+ fuse_uring_dispatch_ent(ent);
+
+ return;
+
+err_unlock:
+ spin_unlock(&queue->lock);
+err:
+ req->out.h.error = err;
+ clear_bit(FR_PENDING, &req->flags);
+ fuse_request_end(req);
+}
+
+bool fuse_uring_queue_bq_req(struct fuse_req *req)
+{
+ struct fuse_conn *fc = req->fm->fc;
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ struct fuse_ring_ent *ent = NULL;
+
+ queue = fuse_uring_task_to_queue(ring);
+ if (!queue)
+ return false;
+
+ spin_lock(&queue->lock);
+ if (unlikely(queue->stopped)) {
+ spin_unlock(&queue->lock);
+ return false;
+ }
+
+ list_add_tail(&req->list, &queue->fuse_req_bg_queue);
+
+ ent = list_first_entry_or_null(&queue->ent_avail_queue,
+ struct fuse_ring_ent, list);
+ spin_lock(&fc->bg_lock);
+ fc->num_background++;
+ if (fc->num_background == fc->max_background)
+ fc->blocked = 1;
+ fuse_uring_flush_bg(queue);
+ spin_unlock(&fc->bg_lock);
+
+ /*
+ * Due to bg_queue flush limits there might be other bg requests
+ * in the queue that need to be handled first. Or no further req
+ * might be available.
+ */
+ req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req,
+ list);
+ if (ent && req) {
+ fuse_uring_add_req_to_ring_ent(ent, req);
+ spin_unlock(&queue->lock);
+
+ fuse_uring_dispatch_ent(ent);
+ } else {
+ spin_unlock(&queue->lock);
+ }
+
+ return true;
+}
+
+static const struct fuse_iqueue_ops fuse_io_uring_ops = {
+	/* should be sent over io-uring as an enhancement */
+ .send_forget = fuse_dev_queue_forget,
+
+ /*
+	 * could be sent over io-uring, but interrupts should be rare,
+ * no need to make the code complex
+ */
+ .send_interrupt = fuse_dev_queue_interrupt,
+ .send_req = fuse_uring_queue_fuse_req,
+};
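For orientation, a rough userspace-side sketch of how a server drives this file: one SQE128 io_uring per core, each ring entry registered once with FUSE_IO_URING_CMD_REGISTER and thereafter recycled via FUSE_IO_URING_CMD_COMMIT_AND_FETCH. This is a hedged illustration, not the reference libfuse implementation; struct and command names come from the uapi header of this series, everything else (buffer sizing, error handling, completion handling) is simplified:

#include <liburing.h>
#include <linux/fuse.h>
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>

/* Illustrative sketch: queue one REGISTER SQE for a ring entry.
 * The io_uring instance must be set up with IORING_SETUP_SQE128;
 * fuse_dev_fd is a /dev/fuse clone for this mount, qid is the CPU core,
 * and payload_sz must be at least the kernel's max payload size
 * (derived from max_write/max_pages, see fuse_uring_create() above). */
static int example_register_entry(struct io_uring *ring, int fuse_dev_fd,
				  unsigned int qid,
				  struct fuse_uring_req_header *headers,
				  void *payload, size_t payload_sz,
				  struct iovec iov[2])
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct fuse_uring_cmd_req *req;

	if (!sqe)
		return -EAGAIN;

	iov[0].iov_base = headers;		/* fixed-layout headers */
	iov[0].iov_len  = sizeof(*headers);
	iov[1].iov_base = payload;		/* request/reply payload */
	iov[1].iov_len  = payload_sz;

	memset(sqe, 0, 2 * sizeof(*sqe));	/* SQE128 slot is 128 bytes */
	sqe->opcode	= IORING_OP_URING_CMD;
	sqe->fd		= fuse_dev_fd;
	sqe->cmd_op	= FUSE_IO_URING_CMD_REGISTER;
	sqe->addr	= (uint64_t)(uintptr_t)iov;	/* iovec array */
	sqe->len	= 2;				/* FUSE_URING_IOV_SEGS */
	sqe->user_data	= (uintptr_t)headers;		/* per-entry cookie */

	req = (struct fuse_uring_cmd_req *)sqe->cmd;
	req->qid = qid;				/* queue id == CPU core */
	req->commit_id = 0;			/* only used on commit+fetch */

	return io_uring_submit(ring);
}

On completion the server finds the FUSE request in the registered headers/payload buffers, processes it, writes the reply into the same buffers, sets commit_id to the request's unique id and resubmits the SQE with FUSE_IO_URING_CMD_COMMIT_AND_FETCH, which is what fuse_uring_commit_fetch() above consumes.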
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
new file mode 100644
index 000000000000..2102b3d0c1ae
--- /dev/null
+++ b/fs/fuse/dev_uring_i.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * FUSE: Filesystem in Userspace
+ * Copyright (c) 2023-2024 DataDirect Networks.
+ */
+
+#ifndef _FS_FUSE_DEV_URING_I_H
+#define _FS_FUSE_DEV_URING_I_H
+
+#include "fuse_i.h"
+
+#ifdef CONFIG_FUSE_IO_URING
+
+#define FUSE_URING_TEARDOWN_TIMEOUT (5 * HZ)
+#define FUSE_URING_TEARDOWN_INTERVAL (HZ/20)
+
+enum fuse_ring_req_state {
+ FRRS_INVALID = 0,
+
+ /* The ring entry received from userspace and it is being processed */
+ FRRS_COMMIT,
+
+ /* The ring entry is waiting for new fuse requests */
+ FRRS_AVAILABLE,
+
+ /* The ring entry got assigned a fuse req */
+ FRRS_FUSE_REQ,
+
+ /* The ring entry is in or on the way to user space */
+ FRRS_USERSPACE,
+
+ /* The ring entry is in teardown */
+ FRRS_TEARDOWN,
+
+ /* The ring entry is released, but not freed yet */
+ FRRS_RELEASED,
+};
+
+/** A fuse ring entry, part of the ring queue */
+struct fuse_ring_ent {
+ /* userspace buffer */
+ struct fuse_uring_req_header __user *headers;
+ void __user *payload;
+
+ /* the ring queue that owns the request */
+ struct fuse_ring_queue *queue;
+
+ /* fields below are protected by queue->lock */
+
+ struct io_uring_cmd *cmd;
+
+ struct list_head list;
+
+ enum fuse_ring_req_state state;
+
+ struct fuse_req *fuse_req;
+};
+
+struct fuse_ring_queue {
+ /*
+ * back pointer to the main fuse uring structure that holds this
+ * queue
+ */
+ struct fuse_ring *ring;
+
+ /* queue id, corresponds to the cpu core */
+ unsigned int qid;
+
+ /*
+ * queue lock, taken when any value in the queue changes _and_ also
+ * a ring entry state changes.
+ */
+ spinlock_t lock;
+
+ /* available ring entries (struct fuse_ring_ent) */
+ struct list_head ent_avail_queue;
+
+ /*
+ * entries in the process of being committed or in the process
+ * to be sent to userspace
+ */
+ struct list_head ent_w_req_queue;
+ struct list_head ent_commit_queue;
+
+ /* entries in userspace */
+ struct list_head ent_in_userspace;
+
+ /* entries that are released */
+ struct list_head ent_released;
+
+ /* fuse requests waiting for an entry slot */
+ struct list_head fuse_req_queue;
+
+ /* background fuse requests */
+ struct list_head fuse_req_bg_queue;
+
+ struct fuse_pqueue fpq;
+
+ unsigned int active_background;
+
+ bool stopped;
+};
+
+/**
+ * Describes whether io-uring is used for communication and holds all the
+ * data needed for io-uring communication
+ */
+struct fuse_ring {
+ /* back pointer */
+ struct fuse_conn *fc;
+
+ /* number of ring queues */
+ size_t nr_queues;
+
+ /* maximum payload/arg size */
+ size_t max_payload_sz;
+
+ struct fuse_ring_queue **queues;
+
+ /*
+ * Log ring entry states on stop when entries cannot be released
+ */
+ unsigned int stop_debug_log : 1;
+
+ wait_queue_head_t stop_waitq;
+
+ /* async tear down */
+ struct delayed_work async_teardown_work;
+
+ /* log */
+ unsigned long teardown_time;
+
+ atomic_t queue_refs;
+
+ bool ready;
+};
+
+bool fuse_uring_enabled(void);
+void fuse_uring_destruct(struct fuse_conn *fc);
+void fuse_uring_stop_queues(struct fuse_ring *ring);
+void fuse_uring_abort_end_requests(struct fuse_ring *ring);
+int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
+bool fuse_uring_queue_bq_req(struct fuse_req *req);
+
+static inline void fuse_uring_abort(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+
+ if (ring == NULL)
+ return;
+
+ if (atomic_read(&ring->queue_refs) > 0) {
+ fuse_uring_abort_end_requests(ring);
+ fuse_uring_stop_queues(ring);
+ }
+}
+
+static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+
+ if (ring)
+ wait_event(ring->stop_waitq,
+ atomic_read(&ring->queue_refs) == 0);
+}
+
+static inline bool fuse_uring_ready(struct fuse_conn *fc)
+{
+ return fc->ring && fc->ring->ready;
+}
+
+#else /* CONFIG_FUSE_IO_URING */
+
+struct fuse_ring;
+
+static inline void fuse_uring_create(struct fuse_conn *fc)
+{
+}
+
+static inline void fuse_uring_destruct(struct fuse_conn *fc)
+{
+}
+
+static inline bool fuse_uring_enabled(void)
+{
+ return false;
+}
+
+static inline void fuse_uring_abort(struct fuse_conn *fc)
+{
+}
+
+static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc)
+{
+}
+
+static inline bool fuse_uring_ready(struct fuse_conn *fc)
+{
+ return false;
+}
+
+#endif /* CONFIG_FUSE_IO_URING */
+
+#endif /* _FS_FUSE_DEV_URING_I_H */
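The enum above tracks each entry through its lifecycle: AVAILABLE -> FUSE_REQ -> USERSPACE -> COMMIT and back to AVAILABLE, with TEARDOWN/RELEASED reached only on shutdown. A small illustrative helper (not part of this diff) that could back the numeric state values printed by the pr_info()/pr_warn() messages in dev_uring.c:

/* Illustrative only: map fuse_ring_req_state values to names for tracing. */
static inline const char *fuse_ring_state_name(enum fuse_ring_req_state state)
{
	switch (state) {
	case FRRS_INVALID:	return "invalid";
	case FRRS_COMMIT:	return "commit";
	case FRRS_AVAILABLE:	return "available";
	case FRRS_FUSE_REQ:	return "fuse-req";
	case FRRS_USERSPACE:	return "userspace";
	case FRRS_TEARDOWN:	return "teardown";
	case FRRS_RELEASED:	return "released";
	}
	return "unknown";
}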
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bf057cf7098d..198862b086ff 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -175,9 +175,12 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
memset(outarg, 0, sizeof(struct fuse_entry_out));
args->opcode = FUSE_LOOKUP;
args->nodeid = nodeid;
- args->in_numargs = 1;
- args->in_args[0].size = name->len + 1;
- args->in_args[0].value = name->name;
+ args->in_numargs = 3;
+ fuse_set_zero_arg0(args);
+ args->in_args[1].size = name->len;
+ args->in_args[1].value = name->name;
+ args->in_args[2].size = 1;
+ args->in_args[2].value = "";
args->out_numargs = 1;
args->out_args[0].size = sizeof(struct fuse_entry_out);
args->out_args[0].value = outarg;
@@ -192,10 +195,10 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
* the lookup once more. If the lookup results in the same inode,
* then refresh the attributes, timeouts and mark the dentry valid.
*/
-static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
+static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *entry, unsigned int flags)
{
struct inode *inode;
- struct dentry *parent;
struct fuse_mount *fm;
struct fuse_inode *fi;
int ret;
@@ -227,11 +230,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
attr_version = fuse_get_attr_version(fm->fc);
- parent = dget_parent(entry);
- fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)),
- &entry->d_name, &outarg);
+ fuse_lookup_init(fm->fc, &args, get_node_id(dir),
+ name, &outarg);
ret = fuse_simple_request(fm, &args);
- dput(parent);
/* Zero nodeid is same as -ENOENT */
if (!ret && !outarg.nodeid)
ret = -ENOENT;
@@ -265,9 +266,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state))
return -ECHILD;
} else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) {
- parent = dget_parent(entry);
- fuse_advise_use_readdirplus(d_inode(parent));
- dput(parent);
+ fuse_advise_use_readdirplus(dir);
}
}
ret = 1;
@@ -929,11 +928,12 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir,
FUSE_ARGS(args);
args.opcode = FUSE_SYMLINK;
- args.in_numargs = 2;
- args.in_args[0].size = entry->d_name.len + 1;
- args.in_args[0].value = entry->d_name.name;
- args.in_args[1].size = len;
- args.in_args[1].value = link;
+ args.in_numargs = 3;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
+ args.in_args[2].size = len;
+ args.in_args[2].value = link;
return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK);
}
@@ -993,9 +993,10 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
args.opcode = FUSE_UNLINK;
args.nodeid = get_node_id(dir);
- args.in_numargs = 1;
- args.in_args[0].size = entry->d_name.len + 1;
- args.in_args[0].value = entry->d_name.name;
+ args.in_numargs = 2;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
fuse_dir_changed(dir);
@@ -1016,9 +1017,10 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
args.opcode = FUSE_RMDIR;
args.nodeid = get_node_id(dir);
- args.in_numargs = 1;
- args.in_args[0].size = entry->d_name.len + 1;
- args.in_args[0].value = entry->d_name.name;
+ args.in_numargs = 2;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = entry->d_name.len + 1;
+ args.in_args[1].value = entry->d_name.name;
err = fuse_simple_request(fm, &args);
if (!err) {
fuse_dir_changed(dir);
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
new file mode 100644
index 000000000000..3b2bfe1248d3
--- /dev/null
+++ b/fs/fuse/fuse_dev_i.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+ */
+#ifndef _FS_FUSE_DEV_I_H
+#define _FS_FUSE_DEV_I_H
+
+#include <linux/types.h>
+
+/* Ordinary requests have even IDs, while interrupt IDs are odd */
+#define FUSE_INT_REQ_BIT (1ULL << 0)
+#define FUSE_REQ_ID_STEP (1ULL << 1)
+
+struct fuse_arg;
+struct fuse_args;
+struct fuse_pqueue;
+struct fuse_req;
+struct fuse_iqueue;
+struct fuse_forget_link;
+
+struct fuse_copy_state {
+ int write;
+ struct fuse_req *req;
+ struct iov_iter *iter;
+ struct pipe_buffer *pipebufs;
+ struct pipe_buffer *currbuf;
+ struct pipe_inode_info *pipe;
+ unsigned long nr_segs;
+ struct page *pg;
+ unsigned int len;
+ unsigned int offset;
+ unsigned int move_pages:1;
+ unsigned int is_uring:1;
+ struct {
+ unsigned int copied_sz; /* copied size into the user buffer */
+ } ring;
+};
+
+static inline struct fuse_dev *fuse_get_dev(struct file *file)
+{
+ /*
+ * Lockless access is OK, because file->private data is set
+ * once during mount and is valid until the file is released.
+ */
+ return READ_ONCE(file->private_data);
+}
+
+unsigned int fuse_req_hash(u64 unique);
+struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
+
+void fuse_dev_end_requests(struct list_head *head);
+
+void fuse_copy_init(struct fuse_copy_state *cs, int write,
+ struct iov_iter *iter);
+int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
+ unsigned int argpages, struct fuse_arg *args,
+ int zeroing);
+int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
+ unsigned int nbytes);
+void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
+ struct fuse_forget_link *forget);
+void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req);
+
+#endif
+
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 74744c6f2860..fee96fe7887b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -310,7 +310,7 @@ struct fuse_args {
bool is_ext:1;
bool is_pinned:1;
bool invalidate_vmap:1;
- struct fuse_in_arg in_args[3];
+ struct fuse_in_arg in_args[4];
struct fuse_arg out_args[2];
void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
/* Used for kvec iter backed by vmalloc address */
@@ -438,6 +438,10 @@ struct fuse_req {
/** fuse_mount this request belongs to */
struct fuse_mount *fm;
+
+#ifdef CONFIG_FUSE_IO_URING
+ void *ring_entry;
+#endif
};
struct fuse_iqueue;
@@ -863,6 +867,9 @@ struct fuse_conn {
/* Use pages instead of pointer for kernel I/O */
unsigned int use_pages_for_kvec_io:1;
+ /* Use io_uring for communication */
+ unsigned int io_uring;
+
/** Maximum stack depth for passthrough backing files */
int max_stack_depth;
@@ -923,6 +930,11 @@ struct fuse_conn {
/** IDR for backing files ids */
struct idr backing_files_map;
#endif
+
+#ifdef CONFIG_FUSE_IO_URING
+	/** uring connection information */
+ struct fuse_ring *ring;
+#endif
};
/*
@@ -947,6 +959,19 @@ struct fuse_mount {
struct rcu_head rcu;
};
+/*
+ * Empty header for FUSE opcodes without specific header needs.
+ * Used as a placeholder in args->in_args[0] for consistency
+ * across all FUSE operations, simplifying request handling.
+ */
+struct fuse_zero_header {};
+
+static inline void fuse_set_zero_arg0(struct fuse_args *args)
+{
+ args->in_args[0].size = sizeof(struct fuse_zero_header);
+ args->in_args[0].value = NULL;
+}
+
static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
{
return sb->s_fs_info;
@@ -1220,6 +1245,11 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
+ * Initialize the fuse processing queue
+ */
+void fuse_pqueue_init(struct fuse_pqueue *fpq);
+
+/**
* Initialize fuse_conn
*/
void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
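The net effect of fuse_set_zero_arg0() and the enlarged in_args[] above is that every request now carries a (possibly zero-sized) operation header in in_args[0], giving the io-uring path a fixed header/payload split; the dax.c, dev.c, dir.c and xattr.c hunks in this diff all convert callers to that layout. A condensed, hedged sketch of the converted pattern for an opcode without its own header struct; the helper is hypothetical and only mirrors those hunks:

/* Kernel context assumed; mirrors the FUSE_UNLINK/FUSE_RMDIR conversions. */
static void example_args_for_name_only_op(struct fuse_args *args,
					  uint32_t opcode, u64 nodeid,
					  const char *name)
{
	args->opcode = opcode;
	args->nodeid = nodeid;
	args->in_numargs = 2;
	fuse_set_zero_arg0(args);		/* in_args[0]: empty op header */
	args->in_args[1].size = strlen(name) + 1;
	args->in_args[1].value = name;
}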
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3ce4f4e81d09..e9db2cb8c150 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -7,6 +7,7 @@
*/
#include "fuse_i.h"
+#include "dev_uring_i.h"
#include <linux/pagemap.h>
#include <linux/slab.h>
@@ -937,7 +938,7 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq,
fiq->priv = priv;
}
-static void fuse_pqueue_init(struct fuse_pqueue *fpq)
+void fuse_pqueue_init(struct fuse_pqueue *fpq)
{
unsigned int i;
@@ -992,6 +993,8 @@ static void delayed_release(struct rcu_head *p)
{
struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu);
+ fuse_uring_destruct(fc);
+
put_user_ns(fc->user_ns);
fc->release(fc);
}
@@ -1387,6 +1390,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
else
ok = false;
}
+ if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
+ fc->io_uring = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1446,6 +1451,13 @@ void fuse_send_init(struct fuse_mount *fm)
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
flags |= FUSE_PASSTHROUGH;
+ /*
+ * This is just an information flag for fuse server. No need to check
+ * the reply - server is either sending IORING_OP_URING_CMD or not.
+ */
+ if (fuse_uring_enabled())
+ flags |= FUSE_OVER_IO_URING;
+
ia->in.flags = flags;
ia->in.flags2 = flags >> 32;
diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c
index b272bb333005..63fb1e5bee30 100644
--- a/fs/fuse/sysctl.c
+++ b/fs/fuse/sysctl.c
@@ -13,7 +13,7 @@ static struct ctl_table_header *fuse_table_header;
/* Bound by fuse_init_out max_pages, which is a u16 */
static unsigned int sysctl_fuse_max_pages_limit = 65535;
-static struct ctl_table fuse_sysctl_table[] = {
+static const struct ctl_table fuse_sysctl_table[] = {
{
.procname = "max_pages_limit",
.data = &fuse_max_pages_limit,
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 9f568d345c51..93dfb06b6cea 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -164,9 +164,10 @@ int fuse_removexattr(struct inode *inode, const char *name)
args.opcode = FUSE_REMOVEXATTR;
args.nodeid = get_node_id(inode);
- args.in_numargs = 1;
- args.in_args[0].size = strlen(name) + 1;
- args.in_args[0].value = name;
+ args.in_numargs = 2;
+ fuse_set_zero_arg0(&args);
+ args.in_args[1].size = strlen(name) + 1;
+ args.in_args[1].value = name;
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fm->fc->no_removexattr = 1;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 2e215e8c3c88..95050e719233 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -21,7 +21,9 @@
/**
* gfs2_drevalidate - Check directory lookup consistency
- * @dentry: the mapping to check
+ * @dir: expected parent directory inode
+ * @name: expected name
+ * @dentry: dentry to check
* @flags: lookup flags
*
* Check to make sure the lookup necessary to arrive at this inode from its
@@ -30,50 +32,43 @@
* Returns: 1 if the dentry is ok, 0 if it isn't
*/
-static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
+static int gfs2_drevalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
- struct dentry *parent;
- struct gfs2_sbd *sdp;
- struct gfs2_inode *dip;
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ struct gfs2_inode *dip = GFS2_I(dir);
struct inode *inode;
struct gfs2_holder d_gh;
struct gfs2_inode *ip = NULL;
- int error, valid = 0;
+ int error, valid;
int had_lock = 0;
if (flags & LOOKUP_RCU)
return -ECHILD;
- parent = dget_parent(dentry);
- sdp = GFS2_SB(d_inode(parent));
- dip = GFS2_I(d_inode(parent));
inode = d_inode(dentry);
if (inode) {
if (is_bad_inode(inode))
- goto out;
+ return 0;
ip = GFS2_I(inode);
}
- if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) {
- valid = 1;
- goto out;
- }
+ if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+ return 1;
had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
if (!had_lock) {
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
if (error)
- goto out;
+ return 0;
}
- error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip);
+ error = gfs2_dir_check(dir, name, ip);
valid = inode ? !error : (error == -ENOENT);
if (!had_lock)
gfs2_glock_dq_uninit(&d_gh);
-out:
- dput(parent);
return valid;
}
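The gfs2 hunk above is one instance of the tree-wide ->d_revalidate() signature change running through this diff: the VFS now passes the expected parent inode and name, so implementations can drop the dget_parent()/dput() dance. A hypothetical minimal filesystem hook under the new prototype (the examplefs_* names and examplefs_name_matches() are made up for illustration):

    static int examplefs_d_revalidate(struct inode *dir, const struct qstr *name,
                                      struct dentry *dentry, unsigned int flags)
    {
            if (flags & LOOKUP_RCU)
                    return -ECHILD;         /* ask the VFS to retry in ref-walk mode */

            /* stand-in for a backend check of (dir, name) against the dentry */
            return examplefs_name_matches(dir, name, d_inode(dentry)) ? 1 : 0;
    }

    static const struct dentry_operations examplefs_dentry_ops = {
            .d_revalidate   = examplefs_d_revalidate,
    };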
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8c4c1f871a88..65c07aa95718 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1201,8 +1201,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
if (glops->go_instantiate)
gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED);
gl->gl_name = name;
+ lockref_init(&gl->gl_lockref);
lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
- gl->gl_lockref.count = 1;
gl->gl_state = LM_ST_UNLOCKED;
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 04cadc02e5a6..0727f60ad028 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -51,7 +51,6 @@ static void gfs2_init_glock_once(void *foo)
{
struct gfs2_glock *gl = foo;
- spin_lock_init(&gl->gl_lockref.lock);
INIT_LIST_HEAD(&gl->gl_holders);
INIT_LIST_HEAD(&gl->gl_lru);
INIT_LIST_HEAD(&gl->gl_ail_list);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 58bc5013ca49..2298e06797ac 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -236,7 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str
return NULL;
qd->qd_sbd = sdp;
- lockref_init(&qd->qd_lockref, 0);
+ lockref_init(&qd->qd_lockref);
qd->qd_id = qid;
qd->qd_slot = -1;
INIT_LIST_HEAD(&qd->qd_lru);
@@ -297,7 +297,6 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
spin_lock_bucket(hash);
*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
if (qd == NULL) {
- new_qd->qd_lockref.count++;
*qdp = new_qd;
list_add(&new_qd->qd_list, &sdp->sd_quota_list);
hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
@@ -1450,6 +1449,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
if (qd == NULL)
goto fail_brelse;
+ qd->qd_lockref.count = 0;
set_bit(QDF_CHANGE, &qd->qd_flags);
qd->qd_change = qc_change;
qd->qd_slot = slot;
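Both gfs2 hunks above lean on the reworked lockref_init() from this cycle, which (as the removed lines suggest) now initializes the spinlock and sets the count to 1 in one call; callers that really want an unreferenced object, such as the quota slots scanned in gfs2_quota_init(), reset the count explicitly afterwards. Roughly:

    struct lockref lr;

    lockref_init(&lr);      /* spin_lock_init(&lr.lock); lr.count = 1; */

    /* rare case: object that must start with no reference held */
    lr.count = 0;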
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 76fa02e3835b..ef54fc8093cf 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -13,7 +13,8 @@
/* dentry case-handling: just lowercase everything */
-static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags)
+static int hfs_revalidate_dentry(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
int diff;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7e51d2cec64b..e0741e468956 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -95,32 +95,17 @@ __uml_setup("hostfs=", hostfs_args,
static char *__dentry_name(struct dentry *dentry, char *name)
{
char *p = dentry_path_raw(dentry, name, PATH_MAX);
- char *root;
- size_t len;
- struct hostfs_fs_info *fsi;
-
- fsi = dentry->d_sb->s_fs_info;
- root = fsi->host_root_path;
- len = strlen(root);
- if (IS_ERR(p)) {
- __putname(name);
- return NULL;
- }
+ struct hostfs_fs_info *fsi = dentry->d_sb->s_fs_info;
+ char *root = fsi->host_root_path;
+ size_t len = strlen(root);
- /*
- * This function relies on the fact that dentry_path_raw() will place
- * the path name at the end of the provided buffer.
- */
- BUG_ON(p + strlen(p) + 1 != name + PATH_MAX);
-
- strscpy(name, root, PATH_MAX);
- if (len > p - name) {
+ if (IS_ERR(p) || len > p - name) {
__putname(name);
return NULL;
}
- if (p > name + len)
- strcpy(name + len, p);
+ memcpy(name, root, len);
+ memmove(name + len, p, name + PATH_MAX - p);
return name;
}
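The simplification above works because dentry_path_raw() places the path at the tail end of the PATH_MAX buffer, so one bounds check plus memcpy()/memmove() is enough to glue the host root in front of it. An annotated sketch of that buffer manipulation (example values in the comments, not code from the patch):

    static char *example_prefix_host_root(struct dentry *dentry, char *name,
                                          const char *root)
    {
            char *p = dentry_path_raw(dentry, name, PATH_MAX);
            size_t len = strlen(root);

            /* name: [ ........unused........ | "/sub/dir/file\0" ]   <- p */
            if (IS_ERR(p) || len > p - name)
                    return NULL;            /* root prefix would not fit before the path */

            memcpy(name, root, len);        /* "/host/root" copied to the front */
            memmove(name + len, p, name + PATH_MAX - p);
            /* name: [ "/host/root/sub/dir/file\0" ... ] */
            return name;
    }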
@@ -410,38 +395,33 @@ static const struct file_operations hostfs_dir_fops = {
.fsync = hostfs_fsync,
};
-static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
+static int hostfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
- char *buffer;
- loff_t base = page_offset(page);
- int count = PAGE_SIZE;
- int end_index = inode->i_size >> PAGE_SHIFT;
- int err;
-
- if (page->index >= end_index)
- count = inode->i_size & (PAGE_SIZE-1);
-
- buffer = kmap_local_page(page);
-
- err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
- if (err != count) {
- if (err >= 0)
- err = -EIO;
- mapping_set_error(mapping, err);
- goto out;
+ struct folio *folio = NULL;
+ loff_t i_size = i_size_read(inode);
+ int err = 0;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
+ loff_t pos = folio_pos(folio);
+ size_t count = folio_size(folio);
+ char *buffer;
+ int ret;
+
+ if (count > i_size - pos)
+ count = i_size - pos;
+
+ buffer = kmap_local_folio(folio, 0);
+ ret = write_file(HOSTFS_I(inode)->fd, &pos, buffer, count);
+ kunmap_local(buffer);
+ folio_unlock(folio);
+ if (ret != count) {
+ err = ret < 0 ? ret : -EIO;
+ mapping_set_error(mapping, err);
+ }
}
- if (base > inode->i_size)
- inode->i_size = base;
-
- err = 0;
-
- out:
- kunmap_local(buffer);
- unlock_page(page);
-
return err;
}
@@ -506,11 +486,12 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
}
static const struct address_space_operations hostfs_aops = {
- .writepage = hostfs_writepage,
+ .writepages = hostfs_writepages,
.read_folio = hostfs_read_folio,
.dirty_folio = filemap_dirty_folio,
.write_begin = hostfs_write_begin,
.write_end = hostfs_write_end,
+ .migrate_folio = filemap_migrate_folio,
};
static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st)
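hostfs is one of several ->writepage to ->writepages conversions this cycle; the core of the new function is the writeback_iter() loop, which hands back one locked, dirty folio at a time. A generic sketch of that pattern, with write_one_folio() standing in for the filesystem's own synchronous I/O:

    static int examplefs_writepages(struct address_space *mapping,
                                    struct writeback_control *wbc)
    {
            struct folio *folio = NULL;
            int error = 0;

            while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
                    error = write_one_folio(mapping->host, folio);  /* fs-specific I/O */
                    folio_unlock(folio);
                    if (error)
                            mapping_set_error(mapping, error);
            }
            return error;
    }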
diff --git a/fs/inode.c b/fs/inode.c
index 6b4c77268fc0..5587aabdaa5e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -184,7 +184,7 @@ static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
-static struct ctl_table inodes_sysctls[] = {
+static const struct ctl_table inodes_sysctls[] = {
{
.procname = "inode-nr",
.data = &inodes_stat,
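The `struct ctl_table` to `const struct ctl_table` edits repeated throughout this diff rely on the sysctl registration API accepting const tables, which is what this tree-wide constification is about. A sketch of the resulting pattern (the example_* names are made up):

    static int example_enable = 1;

    static const struct ctl_table example_sysctls[] = {
            {
                    .procname       = "example-enable",
                    .data           = &example_enable,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
    };

    static int __init example_sysctl_init(void)
    {
            return register_sysctl("fs/example", example_sysctls) ? 0 : -ENOMEM;
    }
    fs_initcall(example_sysctl_init);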
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index d68a4e6ac345..fc8ede43afde 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1576,7 +1576,8 @@ out:
return result;
}
-static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags)
+static int jfs_ci_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
/*
* This is not negative dentry. Always valid.
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 458519e416fe..5f0f8b95f44c 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1109,7 +1109,8 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
return ERR_PTR(rc);
}
-static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
+static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct kernfs_node *kn;
struct kernfs_root *root;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 8502ef68459b..0eb320617d7b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -927,7 +927,7 @@ repeat:
if (!inode)
continue;
- name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
+ name = QSTR(kn->name);
parent = kernfs_get_parent(kn);
if (parent) {
p_inode = ilookup(info->sb, kernfs_ino(parent));
diff --git a/fs/libfs.c b/fs/libfs.c
index 5b6120b19e99..8444f5cc4064 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1782,7 +1782,7 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
{
const struct dentry *parent;
const struct inode *dir;
- char strbuf[DNAME_INLINE_LEN];
+ union shortname_store strbuf;
struct qstr qstr;
/*
@@ -1802,22 +1802,23 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
if (!dir || !IS_CASEFOLDED(dir))
return 1;
+ qstr.len = len;
+ qstr.name = str;
/*
* If the dentry name is stored in-line, then it may be concurrently
* modified by a rename. If this happens, the VFS will eventually retry
* the lookup, so it doesn't matter what ->d_compare() returns.
* However, it's unsafe to call utf8_strncasecmp() with an unstable
* string. Therefore, we have to copy the name into a temporary buffer.
+ * As above, len is guaranteed to match str, so the shortname case
+ * is exactly when str points to ->d_shortname.
*/
- if (len <= DNAME_INLINE_LEN - 1) {
- memcpy(strbuf, str, len);
- strbuf[len] = 0;
- str = strbuf;
+ if (qstr.name == dentry->d_shortname.string) {
+ strbuf = dentry->d_shortname; // NUL is guaranteed to be in there
+ qstr.name = strbuf.string;
/* prevent compiler from optimizing out the temporary buffer */
barrier();
}
- qstr.len = len;
- qstr.name = str;
return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7ded57ec3a60..2c8eedc6c2cc 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -412,7 +412,7 @@ EXPORT_SYMBOL_GPL(lockd_down);
* Sysctl parameters (same as module parameters, different interface).
*/
-static struct ctl_table nlm_sysctls[] = {
+static const struct ctl_table nlm_sysctls[] = {
{
.procname = "nlm_grace_period",
.data = &nlm_grace_period,
diff --git a/fs/locks.c b/fs/locks.c
index 25afc8d9c9d1..1619cddfa7a4 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -97,7 +97,7 @@ static int leases_enable = 1;
static int lease_break_time = 45;
#ifdef CONFIG_SYSCTL
-static struct ctl_table locks_sysctls[] = {
+static const struct ctl_table locks_sysctls[] = {
{
.procname = "leases-enable",
.data = &leases_enable,
diff --git a/fs/namei.c b/fs/namei.c
index e56c29a22d26..3ab9440c5b93 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -921,10 +921,11 @@ out_dput:
return false;
}
-static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
+static inline int d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
- return dentry->d_op->d_revalidate(dentry, flags);
+ return dentry->d_op->d_revalidate(dir, name, dentry, flags);
else
return 1;
}
@@ -1099,7 +1100,7 @@ static int sysctl_protected_fifos __read_mostly;
static int sysctl_protected_regular __read_mostly;
#ifdef CONFIG_SYSCTL
-static struct ctl_table namei_sysctls[] = {
+static const struct ctl_table namei_sysctls[] = {
{
.procname = "protected_symlinks",
.data = &sysctl_protected_symlinks,
@@ -1652,7 +1653,7 @@ static struct dentry *lookup_dcache(const struct qstr *name,
{
struct dentry *dentry = d_lookup(dir, name);
if (dentry) {
- int error = d_revalidate(dentry, flags);
+ int error = d_revalidate(dir->d_inode, name, dentry, flags);
if (unlikely(error <= 0)) {
if (!error)
d_invalidate(dentry);
@@ -1737,19 +1738,20 @@ static struct dentry *lookup_fast(struct nameidata *nd)
if (read_seqcount_retry(&parent->d_seq, nd->seq))
return ERR_PTR(-ECHILD);
- status = d_revalidate(dentry, nd->flags);
+ status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
if (likely(status > 0))
return dentry;
if (!try_to_unlazy_next(nd, dentry))
return ERR_PTR(-ECHILD);
if (status == -ECHILD)
/* we'd been told to redo it in non-rcu mode */
- status = d_revalidate(dentry, nd->flags);
+ status = d_revalidate(nd->inode, &nd->last,
+ dentry, nd->flags);
} else {
dentry = __d_lookup(parent, &nd->last);
if (unlikely(!dentry))
return NULL;
- status = d_revalidate(dentry, nd->flags);
+ status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
}
if (unlikely(status <= 0)) {
if (!status)
@@ -1777,7 +1779,7 @@ again:
if (IS_ERR(dentry))
return dentry;
if (unlikely(!d_in_lookup(dentry))) {
- int error = d_revalidate(dentry, flags);
+ int error = d_revalidate(inode, name, dentry, flags);
if (unlikely(error <= 0)) {
if (!error) {
d_invalidate(dentry);
@@ -3575,7 +3577,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
if (d_in_lookup(dentry))
break;
- error = d_revalidate(dentry, nd->flags);
+ error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags);
if (likely(error > 0))
break;
if (error)
diff --git a/fs/namespace.c b/fs/namespace.c
index 4013fbac354a..8f1000f9f3df 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5087,30 +5087,29 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
{
struct vfsmount *mnt = s->mnt;
struct super_block *sb = mnt->mnt_sb;
+ size_t start = seq->count;
int err;
- if (sb->s_op->show_options) {
- size_t start = seq->count;
-
- err = security_sb_show_options(seq, sb);
- if (err)
- return err;
+ err = security_sb_show_options(seq, sb);
+ if (err)
+ return err;
+ if (sb->s_op->show_options) {
err = sb->s_op->show_options(seq, mnt->mnt_root);
if (err)
return err;
+ }
- if (unlikely(seq_has_overflowed(seq)))
- return -EAGAIN;
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
- if (seq->count == start)
- return 0;
+ if (seq->count == start)
+ return 0;
- /* skip leading comma */
- memmove(seq->buf + start, seq->buf + start + 1,
- seq->count - start - 1);
- seq->count--;
- }
+ /* skip leading comma */
+ memmove(seq->buf + start, seq->buf + start + 1,
+ seq->count - start - 1);
+ seq->count--;
return 0;
}
@@ -5191,39 +5190,45 @@ static int statmount_string(struct kstatmount *s, u64 flag)
size_t kbufsize;
struct seq_file *seq = &s->seq;
struct statmount *sm = &s->sm;
- u32 start = seq->count;
+ u32 start, *offp;
+
+ /* Reserve an empty string at the beginning for any unset offsets */
+ if (!seq->count)
+ seq_putc(seq, 0);
+
+ start = seq->count;
switch (flag) {
case STATMOUNT_FS_TYPE:
- sm->fs_type = start;
+ offp = &sm->fs_type;
ret = statmount_fs_type(s, seq);
break;
case STATMOUNT_MNT_ROOT:
- sm->mnt_root = start;
+ offp = &sm->mnt_root;
ret = statmount_mnt_root(s, seq);
break;
case STATMOUNT_MNT_POINT:
- sm->mnt_point = start;
+ offp = &sm->mnt_point;
ret = statmount_mnt_point(s, seq);
break;
case STATMOUNT_MNT_OPTS:
- sm->mnt_opts = start;
+ offp = &sm->mnt_opts;
ret = statmount_mnt_opts(s, seq);
break;
case STATMOUNT_OPT_ARRAY:
- sm->opt_array = start;
+ offp = &sm->opt_array;
ret = statmount_opt_array(s, seq);
break;
case STATMOUNT_OPT_SEC_ARRAY:
- sm->opt_sec_array = start;
+ offp = &sm->opt_sec_array;
ret = statmount_opt_sec_array(s, seq);
break;
case STATMOUNT_FS_SUBTYPE:
- sm->fs_subtype = start;
+ offp = &sm->fs_subtype;
statmount_fs_subtype(s, seq);
break;
case STATMOUNT_SB_SOURCE:
- sm->sb_source = start;
+ offp = &sm->sb_source;
ret = statmount_sb_source(s, seq);
break;
default:
@@ -5251,6 +5256,7 @@ static int statmount_string(struct kstatmount *s, u64 flag)
seq->buf[seq->count++] = '\0';
sm->mask |= flag;
+ *offp = start;
return 0;
}
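With the change above, every string field in struct statmount is either a valid offset into ->str[] or 0, and offset 0 now always refers to a reserved empty string, so an unset field can no longer alias whichever string happened to be written first. A hedged userspace-side sketch of how the offsets are consumed (field and flag names per the statmount(2) ABI, error handling omitted):

    static void example_print_strings(const struct statmount *sm)
    {
            if (sm->mask & STATMOUNT_FS_TYPE)
                    printf("fs type:     %s\n", sm->str + sm->fs_type);
            if (sm->mask & STATMOUNT_MNT_POINT)
                    printf("mount point: %s\n", sm->str + sm->mnt_point);
            /* unset offsets are 0 and resolve to the reserved "" at str[0] */
    }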
@@ -5985,7 +5991,7 @@ const struct proc_ns_operations mntns_operations = {
};
#ifdef CONFIG_SYSCTL
-static struct ctl_table fs_namespace_sysctls[] = {
+static const struct ctl_table fs_namespace_sysctls[] = {
{
.procname = "mount-max",
.data = &sysctl_mount_max,
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index f761d44b3436..0d1b6d35ff3b 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
netfs_cache_read_terminated, subreq);
}
-static void netfs_issue_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
+static void netfs_queue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq,
+ bool last_subreq)
{
struct netfs_io_stream *stream = &rreq->io_streams[0];
@@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq,
}
}
+ if (last_subreq) {
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+ }
+
spin_unlock(&rreq->lock);
+}
+static void netfs_issue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
switch (subreq->source) {
case NETFS_DOWNLOAD_FROM_SERVER:
rreq->netfs_ops->issue_read(subreq);
@@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
}
size -= slice;
start += slice;
- if (size <= 0) {
- smp_wmb(); /* Write lists before ALL_QUEUED. */
- set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
- }
+ netfs_queue_read(rreq, subreq, size <= 0);
netfs_issue_read(rreq, subreq);
cond_resched();
} while (size > 0);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index eb76f98c894b..1c4f953c3d68 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -135,6 +135,8 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_rh_retry_read_req;
+extern atomic_t netfs_n_rh_retry_read_subreq;
extern atomic_t netfs_n_wh_buffered_write;
extern atomic_t netfs_n_wh_writethrough;
extern atomic_t netfs_n_wh_dio_write;
@@ -147,6 +149,8 @@ extern atomic_t netfs_n_wh_upload_failed;
extern atomic_t netfs_n_wh_write;
extern atomic_t netfs_n_wh_write_done;
extern atomic_t netfs_n_wh_write_failed;
+extern atomic_t netfs_n_wh_retry_write_req;
+extern atomic_t netfs_n_wh_retry_write_subreq;
extern atomic_t netfs_n_wb_lock_skip;
extern atomic_t netfs_n_wb_lock_wait;
extern atomic_t netfs_n_folioq;
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index f65affa5a9e4..636cc5a98ef5 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -470,7 +470,8 @@ void netfs_read_collection_worker(struct work_struct *work)
*/
void netfs_wake_read_collector(struct netfs_io_request *rreq)
{
- if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
+ if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) &&
+ !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) {
if (!work_pending(&rreq->work)) {
netfs_get_request(rreq, netfs_rreq_trace_get_work);
if (!queue_work(system_unbound_wq, &rreq->work))
@@ -586,7 +587,8 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq)
smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */
/* If we are at the head of the queue, wake up the collector. */
- if (list_is_first(&subreq->rreq_link, &stream->subrequests))
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests) ||
+ test_bit(NETFS_RREQ_RETRYING, &rreq->flags))
netfs_wake_read_collector(rreq);
netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated);
diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
index 2290af0d51ac..0f294b26e08c 100644
--- a/fs/netfs/read_retry.c
+++ b/fs/netfs/read_retry.c
@@ -14,7 +14,7 @@ static void netfs_reissue_read(struct netfs_io_request *rreq,
{
__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_stat(&netfs_n_rh_retry_read_subreq);
subreq->rreq->netfs_ops->issue_read(subreq);
}
@@ -48,6 +48,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
subreq->retry_count++;
netfs_reset_iter(subreq);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_read(rreq, subreq);
}
}
@@ -75,7 +76,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
struct iov_iter source;
unsigned long long start, len;
size_t part;
- bool boundary = false;
+ bool boundary = false, subreq_superfluous = false;
/* Go through the subreqs and find the next span of contiguous
* buffer that we then rejig (cifs, for example, needs the
@@ -116,8 +117,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
/* Work through the sublist. */
subreq = from;
list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
- if (!len)
+ if (!len) {
+ subreq_superfluous = true;
break;
+ }
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start - subreq->transferred;
subreq->len = len + subreq->transferred;
@@ -154,19 +157,21 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_read(rreq, subreq);
- if (subreq == to)
+ if (subreq == to) {
+ subreq_superfluous = false;
break;
+ }
}
/* If we managed to use fewer subreqs, we can discard the
* excess; if we used the same number, then we're done.
*/
if (!len) {
- if (subreq == to)
+ if (!subreq_superfluous)
continue;
list_for_each_entry_safe_from(subreq, tmp,
&stream->subrequests, rreq_link) {
- trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous);
list_del(&subreq->rreq_link);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
if (subreq == to)
@@ -187,14 +192,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start;
subreq->len = len;
- subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
subreq->stream_nr = stream->stream_nr;
subreq->retry_count = 1;
trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
list_add(&subreq->rreq_link, &to->rreq_link);
to = list_next_entry(to, rreq_link);
@@ -256,14 +259,34 @@ void netfs_retry_reads(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream = &rreq->io_streams[0];
+ DEFINE_WAIT(myself);
+
+ netfs_stat(&netfs_n_rh_retry_read_req);
+
+ set_bit(NETFS_RREQ_RETRYING, &rreq->flags);
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
- wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
+ if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags))
+ continue;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
+ for (;;) {
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags))
+ break;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for);
+ schedule();
+ trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
+ }
+
+ finish_wait(&rreq->waitq, &myself);
}
+ clear_bit(NETFS_RREQ_RETRYING, &rreq->flags);
trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
netfs_retry_read_subrequests(rreq);
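The retry path above replaces a plain wait_on_bit() with an open-coded wait on rreq->waitq, needed because NETFS_RREQ_RETRYING now suppresses the collector offload while retries are being prepared. Stripped of the netfs specifics, the prepare_to_wait()/finish_wait() shape is the standard one (waitq and condition_is_true() are placeholders):

    DEFINE_WAIT(myself);

    for (;;) {
            prepare_to_wait(&waitq, &myself, TASK_UNINTERRUPTIBLE);
            if (condition_is_true())        /* re-check after queuing ourselves */
                    break;
            schedule();                     /* woken by wake_up(&waitq) */
    }
    finish_wait(&waitq, &myself);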
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index f1af344266cc..ab6b916addc4 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -29,6 +29,8 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_rh_retry_read_req;
+atomic_t netfs_n_rh_retry_read_subreq;
atomic_t netfs_n_wh_buffered_write;
atomic_t netfs_n_wh_writethrough;
atomic_t netfs_n_wh_dio_write;
@@ -41,6 +43,8 @@ atomic_t netfs_n_wh_upload_failed;
atomic_t netfs_n_wh_write;
atomic_t netfs_n_wh_write_done;
atomic_t netfs_n_wh_write_failed;
+atomic_t netfs_n_wh_retry_write_req;
+atomic_t netfs_n_wh_retry_write_subreq;
atomic_t netfs_n_wb_lock_skip;
atomic_t netfs_n_wb_lock_wait;
atomic_t netfs_n_folioq;
@@ -81,6 +85,11 @@ int netfs_stats_show(struct seq_file *m, void *v)
atomic_read(&netfs_n_wh_write),
atomic_read(&netfs_n_wh_write_done),
atomic_read(&netfs_n_wh_write_failed));
+ seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n",
+ atomic_read(&netfs_n_rh_retry_read_req),
+ atomic_read(&netfs_n_rh_retry_read_subreq),
+ atomic_read(&netfs_n_wh_retry_write_req),
+ atomic_read(&netfs_n_wh_retry_write_subreq));
seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n",
atomic_read(&netfs_n_rh_rreq),
atomic_read(&netfs_n_rh_sreq),
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 69727411683e..77279fc5b5a7 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -253,6 +253,7 @@ void netfs_reissue_write(struct netfs_io_stream *stream,
subreq->retry_count++;
__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ netfs_stat(&netfs_n_wh_retry_write_subreq);
netfs_do_issue_write(stream, subreq);
}
diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c
index c841a851dd73..545d33079a77 100644
--- a/fs/netfs/write_retry.c
+++ b/fs/netfs/write_retry.c
@@ -203,6 +203,8 @@ void netfs_retry_writes(struct netfs_io_request *wreq)
struct netfs_io_stream *stream;
int s;
+ netfs_stat(&netfs_n_wh_retry_write_req);
+
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 492cffd9d3d8..2b04038b0e40 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1672,7 +1672,7 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
}
-static int nfs_lookup_revalidate_dentry(struct inode *dir,
+static int nfs_lookup_revalidate_dentry(struct inode *dir, const struct qstr *name,
struct dentry *dentry,
struct inode *inode, unsigned int flags)
{
@@ -1690,7 +1690,7 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir,
goto out;
dir_verifier = nfs_save_change_attribute(dir);
- ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
+ ret = NFS_PROTO(dir)->lookup(dir, dentry, name, fhandle, fattr);
if (ret < 0)
goto out;
@@ -1732,8 +1732,8 @@ out:
* cached dentry and do a new lookup.
*/
static int
-nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
+nfs_do_lookup_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
int error = 0;
@@ -1775,7 +1775,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
if (NFS_STALE(inode))
goto out_bad;
- return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
+ return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags);
out_valid:
return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
out_bad:
@@ -1785,38 +1785,26 @@ out_bad:
}
static int
-__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
- int (*reval)(struct inode *, struct dentry *, unsigned int))
+__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct dentry *parent;
- struct inode *dir;
- int ret;
-
if (flags & LOOKUP_RCU) {
if (dentry->d_fsdata == NFS_FSDATA_BLOCKED)
return -ECHILD;
- parent = READ_ONCE(dentry->d_parent);
- dir = d_inode_rcu(parent);
- if (!dir)
- return -ECHILD;
- ret = reval(dir, dentry, flags);
- if (parent != READ_ONCE(dentry->d_parent))
- return -ECHILD;
} else {
/* Wait for unlink to complete - see unblock_revalidate() */
wait_var_event(&dentry->d_fsdata,
smp_load_acquire(&dentry->d_fsdata)
!= NFS_FSDATA_BLOCKED);
- parent = dget_parent(dentry);
- ret = reval(d_inode(parent), dentry, flags);
- dput(parent);
}
- return ret;
+ return 0;
}
-static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int nfs_lookup_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
- return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
+ if (__nfs_lookup_revalidate(dentry, flags))
+ return -ECHILD;
+ return nfs_do_lookup_revalidate(dir, name, dentry, flags);
}
static void block_revalidate(struct dentry *dentry)
@@ -1982,7 +1970,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
dir_verifier = nfs_save_change_attribute(dir);
trace_nfs_lookup_enter(dir, dentry, flags);
- error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
+ error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name,
+ fhandle, fattr);
if (error == -ENOENT) {
if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
dir_verifier = inode_peek_iversion_raw(dir);
@@ -2025,7 +2014,8 @@ void nfs_d_prune_case_insensitive_aliases(struct inode *inode)
EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases);
#if IS_ENABLED(CONFIG_NFS_V4)
-static int nfs4_lookup_revalidate(struct dentry *, unsigned int);
+static int nfs4_lookup_revalidate(struct inode *, const struct qstr *,
+ struct dentry *, unsigned int);
const struct dentry_operations nfs4_dentry_operations = {
.d_revalidate = nfs4_lookup_revalidate,
@@ -2214,11 +2204,14 @@ no_open:
EXPORT_SYMBOL_GPL(nfs_atomic_open);
static int
-nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
+nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
+ if (__nfs_lookup_revalidate(dentry, flags))
+ return -ECHILD;
+
trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
@@ -2254,16 +2247,10 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
reval_dentry:
if (flags & LOOKUP_RCU)
return -ECHILD;
- return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
+ return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags);
full_reval:
- return nfs_do_lookup_revalidate(dir, dentry, flags);
-}
-
-static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
-{
- return __nfs_lookup_revalidate(dentry, flags,
- nfs4_do_lookup_revalidate);
+ return nfs_do_lookup_revalidate(dir, name, dentry, flags);
}
#endif /* CONFIG_NFSV4 */
@@ -2319,7 +2306,8 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
d_drop(dentry);
if (fhandle->size == 0) {
- error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
+ error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name,
+ fhandle, fattr);
if (error)
goto out_error;
}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 2d53574da605..973aed9cc5fe 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -308,7 +308,7 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server)
int err;
/* Look it up again to get its attributes */
- err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry,
+ err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, &dentry->d_name,
ctx->mntfh, ctx->clone_data.fattr);
dput(parent);
if (err != 0)
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 7359e1a3bd84..0c3bc98cd999 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -192,7 +192,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len,
}
static int
-nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
+nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
unsigned short task_flags = 0;
@@ -202,8 +202,7 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
task_flags |= RPC_TASK_TIMEOUT;
dprintk("NFS call lookup %pd2\n", dentry);
- return __nfs3_proc_lookup(dir, dentry->d_name.name,
- dentry->d_name.len, fhandle, fattr,
+ return __nfs3_proc_lookup(dir, name->name, name->len, fhandle, fattr,
task_flags);
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d615d520f8cf..df9669d4ded7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4544,15 +4544,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
}
static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
- struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct dentry *dentry, const struct qstr *name,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
struct nfs_server *server = NFS_SERVER(dir);
int status;
struct nfs4_lookup_arg args = {
.bitmask = server->attr_bitmask,
.dir_fh = NFS_FH(dir),
- .name = &dentry->d_name,
+ .name = name,
};
struct nfs4_lookup_res res = {
.server = server,
@@ -4594,17 +4594,16 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
}
static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
- struct dentry *dentry, struct nfs_fh *fhandle,
- struct nfs_fattr *fattr)
+ struct dentry *dentry, const struct qstr *name,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
struct nfs4_exception exception = {
.interruptible = true,
};
struct rpc_clnt *client = *clnt;
- const struct qstr *name = &dentry->d_name;
int err;
do {
- err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr);
+ err = _nfs4_proc_lookup(client, dir, dentry, name, fhandle, fattr);
trace_nfs4_lookup(dir, name, err);
switch (err) {
case -NFS4ERR_BADNAME:
@@ -4639,13 +4638,13 @@ out:
return err;
}
-static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry,
+static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
int status;
struct rpc_clnt *client = NFS_CLIENT(dir);
- status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr);
+ status = nfs4_proc_lookup_common(&client, dir, dentry, name, fhandle, fattr);
if (client != NFS_CLIENT(dir)) {
rpc_shutdown_client(client);
nfs_fixup_secinfo_attributes(fattr);
@@ -4660,7 +4659,8 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry,
struct rpc_clnt *client = NFS_CLIENT(dir);
int status;
- status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr);
+ status = nfs4_proc_lookup_common(&client, dir, dentry, &dentry->d_name,
+ fhandle, fattr);
if (status < 0)
return ERR_PTR(status);
return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 886a7c4c60b3..d1a92d8f8ba4 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -17,7 +17,7 @@ static const int nfs_set_port_min;
static const int nfs_set_port_max = 65535;
static struct ctl_table_header *nfs4_callback_sysctl_table;
-static struct ctl_table nfs4_cb_sysctls[] = {
+static const struct ctl_table nfs4_cb_sysctls[] = {
{
.procname = "nfs_callback_tcpport",
.data = &nfs_callback_set_tcpport,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 6c09cd090c34..77920a2e3cef 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -153,13 +153,13 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
}
static int
-nfs_proc_lookup(struct inode *dir, struct dentry *dentry,
+nfs_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name,
struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
struct nfs_diropargs arg = {
.fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len
+ .name = name->name,
+ .len = name->len
};
struct nfs_diropok res = {
.fh = fhandle,
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index e645be1a3381..f579df0e8d67 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -14,7 +14,7 @@
static struct ctl_table_header *nfs_callback_sysctl_table;
-static struct ctl_table nfs_cb_sysctls[] = {
+static const struct ctl_table nfs_cb_sysctls[] = {
{
.procname = "nfs_mountpoint_timeout",
.data = &nfs_mountpoint_expiry_timeout,
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 0e552d873eaa..fb9b1656a287 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -446,11 +446,20 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
struct nfsd_file, nf_gc);
struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+ struct svc_serv *serv;
spin_lock(&l->lock);
list_move_tail(&nf->nf_gc, &l->freeme);
spin_unlock(&l->lock);
- svc_wake_up(nn->nfsd_serv);
+
+ /*
+ * The filecache laundrette is shut down after the
+ * nn->nfsd_serv pointer is cleared, but before the
+ * svc_serv is freed.
+ */
+ serv = nn->nfsd_serv;
+ if (serv)
+ svc_wake_up(serv);
}
}
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 4e3be7201b1c..5fb202acb0fd 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -84,6 +84,8 @@ out:
fail:
posix_acl_release(resp->acl_access);
posix_acl_release(resp->acl_default);
+ resp->acl_access = NULL;
+ resp->acl_default = NULL;
goto out;
}
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 5e34e98db969..7b5433bd3019 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -76,6 +76,8 @@ out:
fail:
posix_acl_release(resp->acl_access);
posix_acl_release(resp->acl_default);
+ resp->acl_access = NULL;
+ resp->acl_default = NULL;
goto out;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 50e468bdb8d4..484077200c5d 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -679,7 +679,7 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp,
return status;
status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status);
- if (unlikely(status || cb->cb_seq_status))
+ if (unlikely(status || cb->cb_status))
return status;
if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0)
return -NFSERR_BAD_XDR;
@@ -1583,8 +1583,11 @@ nfsd4_run_cb_work(struct work_struct *work)
nfsd4_process_cb_update(cb);
clnt = clp->cl_cb_client;
- if (!clnt) {
- /* Callback channel broken, or client killed; give up: */
+ if (!clnt || clp->cl_state == NFSD4_COURTESY) {
+ /*
+ * Callback channel broken, client killed or
+ * nfs4_client in courtesy state; give up.
+ */
nfsd41_destroy_cb(cb);
return;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b7a0cfd05401..153eeea2c7c9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4459,10 +4459,11 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
} while (slot && --cnt > 0);
}
+
+out:
seq->maxslots = max(session->se_target_maxslots, seq->maxslots);
seq->target_maxslots = session->se_target_maxslots;
-out:
switch (clp->cl_cb_state) {
case NFSD4_CB_DOWN:
seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 32019751a41e..aef474f1b84b 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -380,8 +380,9 @@ __fh_verify(struct svc_rqst *rqstp,
error = check_nfsd_access(exp, rqstp, may_bypass_gss);
if (error)
goto out;
-
- svc_xprt_set_valid(rqstp->rq_xprt);
+ /* During LOCALIO, fh_verify will be called with a NULL rqstp */
+ if (rqstp)
+ svc_xprt_set_valid(rqstp->rq_xprt);
/* Finally, check access permissions. */
error = nfsd_permission(cred, exp, dentry, access);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index e8015d24a82c..6613b8fcceb0 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -1186,7 +1186,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (size) {
if (phys && blkphy << blkbits == phys + size) {
/* The current extent goes on */
- size += n << blkbits;
+ size += (u64)n << blkbits;
} else {
/* Terminate the current extent */
ret = fiemap_fill_next_extent(
@@ -1199,14 +1199,14 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
flags = FIEMAP_EXTENT_MERGED;
logical = blkoff << blkbits;
phys = blkphy << blkbits;
- size = n << blkbits;
+ size = (u64)n << blkbits;
}
} else {
/* Start a new extent */
flags = FIEMAP_EXTENT_MERGED;
logical = blkoff << blkbits;
phys = blkphy << blkbits;
- size = n << blkbits;
+ size = (u64)n << blkbits;
}
blkoff += n;
}
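The nilfs2 hunks above are a shift-overflow fix: the block count is a 32-bit value, so `n << blkbits` is evaluated in 32-bit arithmetic and can silently truncate for large extents; casting first widens the operand before the shift. A small illustration, assuming a 32-bit n:

    u32 n = 0x200000;                   /* 2M blocks */
    unsigned int blkbits = 12;          /* 4 KiB block size */

    u64 wrong = n << blkbits;           /* shift done in 32 bits: result truncates to 0 */
    u64 right = (u64)n << blkbits;      /* 0x200000000 = 8 GiB, as intended */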
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 6004dfdfdf0f..c4cdaf5fa7ed 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -20,7 +20,7 @@
static int dir_notify_enable __read_mostly = 1;
#ifdef CONFIG_SYSCTL
-static struct ctl_table dnotify_sysctls[] = {
+static const struct ctl_table dnotify_sysctls[] = {
{
.procname = "dir-notify-enable",
.data = &dir_notify_enable,
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6ff94e312232..ba3e2d09eb44 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -58,7 +58,7 @@ static int fanotify_max_queued_events __read_mostly;
static long ft_zero = 0;
static long ft_int_max = INT_MAX;
-static struct ctl_table fanotify_table[] = {
+static const struct ctl_table fanotify_table[] = {
{
.procname = "max_user_groups",
.data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 8ee495a58d0a..fae1b6d397ea 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -648,7 +648,7 @@ EXPORT_SYMBOL_GPL(fsnotify);
* Later, fsnotify permission hooks do not check if there are permission event
* watches, but that there were permission event watches at open time.
*/
-void file_set_fsnotify_mode(struct file *file)
+void file_set_fsnotify_mode_from_watchers(struct file *file)
{
struct dentry *dentry = file->f_path.dentry, *parent;
struct super_block *sb = dentry->d_sb;
@@ -665,7 +665,7 @@ void file_set_fsnotify_mode(struct file *file)
*/
if (likely(!fsnotify_sb_has_priority_watchers(sb,
FSNOTIFY_PRIO_CONTENT))) {
- file->f_mode |= FMODE_NONOTIFY_PERM;
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
return;
}
@@ -676,7 +676,7 @@ void file_set_fsnotify_mode(struct file *file)
if ((!d_is_dir(dentry) && !d_is_reg(dentry)) ||
likely(!fsnotify_sb_has_priority_watchers(sb,
FSNOTIFY_PRIO_PRE_CONTENT))) {
- file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM;
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
return;
}
@@ -686,19 +686,25 @@ void file_set_fsnotify_mode(struct file *file)
*/
mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask,
- FSNOTIFY_PRE_CONTENT_EVENTS)))
+ FSNOTIFY_PRE_CONTENT_EVENTS))) {
+ /* Enable pre-content events */
+ file_set_fsnotify_mode(file, 0);
return;
+ }
/* Is parent watching for pre-content events on this file? */
if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
parent = dget_parent(dentry);
p_mask = fsnotify_inode_watches_children(d_inode(parent));
dput(parent);
- if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)
+ if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) {
+ /* Enable pre-content events */
+ file_set_fsnotify_mode(file, 0);
return;
+ }
}
/* Nobody watching for pre-content events from this file */
- file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM;
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
}
#endif
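The rename above separates policy from mechanism: file_set_fsnotify_mode_from_watchers() decides which permission events are needed, while a small file_set_fsnotify_mode() helper (defined elsewhere, not shown in this diff) applies the chosen FMODE_NONOTIFY* bits. Presumably the helper looks something like the sketch below; the mask name is an assumption:

    /* Assumed shape of the helper used above, not taken from this hunk. */
    static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode)
    {
            file->f_mode &= ~FMODE_FSNOTIFY_MASK;   /* assumed mask over the NONOTIFY bits */
            file->f_mode |= mode;
    }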
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e0c48956608a..b372fb2c56bd 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __ro_after_init;
static long it_zero = 0;
static long it_int_max = INT_MAX;
-static struct ctl_table inotify_table[] = {
+static const struct ctl_table inotify_table[] = {
{
.procname = "max_user_instances",
.data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 8d789b017fa9..af94e3737470 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -787,7 +787,8 @@ pack_runs:
if (err)
goto out;
- attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id);
+ attr = mi_find_attr(ni, mi, NULL, type, name, name_len,
+ &le->id);
if (!attr) {
err = -EINVAL;
goto bad_inode;
@@ -1181,7 +1182,7 @@ repack:
goto out;
}
- attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id);
+ attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, &le->id);
if (!attr) {
err = -EINVAL;
goto out;
@@ -1406,7 +1407,7 @@ int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr,
*/
if (!attr->non_res) {
if (vbo[1] + bytes_per_off > le32_to_cpu(attr->res.data_size)) {
- ntfs_inode_err(&ni->vfs_inode, "is corrupted");
+ _ntfs_bad_inode(&ni->vfs_inode);
return -EINVAL;
}
addr = resident_data(attr);
@@ -1796,7 +1797,7 @@ repack:
goto out;
}
- attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0,
+ attr = mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0,
&le->id);
if (!attr) {
err = -EINVAL;
@@ -2041,8 +2042,8 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
}
/* Look for required attribute. */
- attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL,
- 0, &le->id);
+ attr = mi_find_attr(ni, mi, NULL, ATTR_DATA,
+ NULL, 0, &le->id);
if (!attr) {
err = -EINVAL;
goto out;
@@ -2587,7 +2588,7 @@ int attr_force_nonresident(struct ntfs_inode *ni)
attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi);
if (!attr) {
- ntfs_bad_inode(&ni->vfs_inode, "no data attribute");
+ _ntfs_bad_inode(&ni->vfs_inode);
return -ENOENT;
}
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index fc6a8aa29e3a..b6da80c69ca6 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -512,7 +512,7 @@ out:
ctx->pos = pos;
} else if (err < 0) {
if (err == -EINVAL)
- ntfs_inode_err(dir, "directory corrupted");
+ _ntfs_bad_inode(dir);
ctx->pos = eod;
}
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 8b39d0ce5f28..5df6a0b5add9 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -75,7 +75,7 @@ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni)
{
const struct ATTRIB *attr;
- attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
+ attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) :
NULL;
}
@@ -89,7 +89,7 @@ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni)
{
const struct ATTRIB *attr;
- attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
+ attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) :
NULL;
@@ -148,8 +148,10 @@ int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi)
goto out;
err = mi_get(ni->mi.sbi, rno, &r);
- if (err)
+ if (err) {
+ _ntfs_bad_inode(&ni->vfs_inode);
return err;
+ }
ni_add_mi(ni, r);
@@ -201,7 +203,8 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
*mi = &ni->mi;
/* Look for required attribute in primary record. */
- return mi_find_attr(&ni->mi, attr, type, name, name_len, NULL);
+ return mi_find_attr(ni, &ni->mi, attr, type, name, name_len,
+ NULL);
}
/* First look for list entry of required type. */
@@ -217,7 +220,7 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
return NULL;
/* Look for required attribute. */
- attr = mi_find_attr(m, NULL, type, name, name_len, &le->id);
+ attr = mi_find_attr(ni, m, NULL, type, name, name_len, &le->id);
if (!attr)
goto out;
@@ -238,8 +241,7 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
return attr;
out:
- ntfs_inode_err(&ni->vfs_inode, "failed to parse mft record");
- ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+ _ntfs_bad_inode(&ni->vfs_inode);
return NULL;
}
@@ -259,7 +261,7 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
if (mi)
*mi = &ni->mi;
/* Enum attributes in primary record. */
- return mi_enum_attr(&ni->mi, attr);
+ return mi_enum_attr(ni, &ni->mi, attr);
}
/* Get next list entry. */
@@ -275,7 +277,7 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
*mi = mi2;
/* Find attribute in loaded record. */
- return rec_find_attr_le(mi2, le2);
+ return rec_find_attr_le(ni, mi2, le2);
}
/*
@@ -293,7 +295,8 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
if (!ni->attr_list.size) {
if (pmi)
*pmi = &ni->mi;
- return mi_find_attr(&ni->mi, NULL, type, name, name_len, NULL);
+ return mi_find_attr(ni, &ni->mi, NULL, type, name, name_len,
+ NULL);
}
le = al_find_ex(ni, NULL, type, name, name_len, NULL);
@@ -319,7 +322,7 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
if (pmi)
*pmi = mi;
- attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id);
+ attr = mi_find_attr(ni, mi, NULL, type, name, name_len, &le->id);
if (!attr)
return NULL;
@@ -330,6 +333,7 @@ struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
vcn <= le64_to_cpu(attr->nres.evcn))
return attr;
+ _ntfs_bad_inode(&ni->vfs_inode);
return NULL;
}
@@ -398,7 +402,8 @@ int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
int diff;
if (base_only || type == ATTR_LIST || !ni->attr_list.size) {
- attr = mi_find_attr(&ni->mi, NULL, type, name, name_len, id);
+ attr = mi_find_attr(ni, &ni->mi, NULL, type, name, name_len,
+ id);
if (!attr)
return -ENOENT;
@@ -437,7 +442,7 @@ next_le2:
al_remove_le(ni, le);
- attr = mi_find_attr(mi, NULL, type, name, name_len, id);
+ attr = mi_find_attr(ni, mi, NULL, type, name, name_len, id);
if (!attr)
return -ENOENT;
@@ -485,7 +490,7 @@ ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi,
name = le->name;
}
- attr = mi_insert_attr(mi, type, name, name_len, asize, name_off);
+ attr = mi_insert_attr(ni, mi, type, name, name_len, asize, name_off);
if (!attr) {
if (le_added)
al_remove_le(ni, le);
@@ -673,7 +678,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
if (err)
return err;
- attr_list = mi_find_attr(&ni->mi, NULL, ATTR_LIST, NULL, 0, NULL);
+ attr_list = mi_find_attr(ni, &ni->mi, NULL, ATTR_LIST, NULL, 0, NULL);
if (!attr_list)
return 0;
@@ -695,7 +700,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
if (!mi)
return 0;
- attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+ attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le),
le->name_len, &le->id);
if (!attr)
return 0;
@@ -731,7 +736,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
goto out;
}
- attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+ attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le),
le->name_len, &le->id);
if (!attr) {
/* Should never happened, 'cause already checked. */
@@ -740,7 +745,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
asize = le32_to_cpu(attr->size);
/* Insert into primary record. */
- attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le),
+ attr_ins = mi_insert_attr(ni, &ni->mi, le->type, le_name(le),
le->name_len, asize,
le16_to_cpu(attr->name_off));
if (!attr_ins) {
@@ -768,7 +773,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
if (!mi)
continue;
- attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+ attr = mi_find_attr(ni, mi, NULL, le->type, le_name(le),
le->name_len, &le->id);
if (!attr)
continue;
@@ -831,7 +836,7 @@ int ni_create_attr_list(struct ntfs_inode *ni)
free_b = 0;
attr = NULL;
- for (; (attr = mi_enum_attr(&ni->mi, attr)); le = Add2Ptr(le, sz)) {
+ for (; (attr = mi_enum_attr(ni, &ni->mi, attr)); le = Add2Ptr(le, sz)) {
sz = le_size(attr->name_len);
le->type = attr->type;
le->size = cpu_to_le16(sz);
@@ -886,7 +891,7 @@ int ni_create_attr_list(struct ntfs_inode *ni)
u32 asize = le32_to_cpu(b->size);
u16 name_off = le16_to_cpu(b->name_off);
- attr = mi_insert_attr(mi, b->type, Add2Ptr(b, name_off),
+ attr = mi_insert_attr(ni, mi, b->type, Add2Ptr(b, name_off),
b->name_len, asize, name_off);
if (!attr)
goto out;
@@ -909,7 +914,7 @@ int ni_create_attr_list(struct ntfs_inode *ni)
goto out;
}
- attr = mi_insert_attr(&ni->mi, ATTR_LIST, NULL, 0,
+ attr = mi_insert_attr(ni, &ni->mi, ATTR_LIST, NULL, 0,
lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT);
if (!attr)
goto out;
@@ -993,13 +998,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le,
mi = rb_entry(node, struct mft_inode, node);
if (is_mft_data &&
- (mi_enum_attr(mi, NULL) ||
+ (mi_enum_attr(ni, mi, NULL) ||
vbo <= ((u64)mi->rno << sbi->record_bits))) {
/* We can't accept this record 'cause MFT's bootstrapping. */
continue;
}
if (is_mft &&
- mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, NULL)) {
+ mi_find_attr(ni, mi, NULL, ATTR_DATA, NULL, 0, NULL)) {
/*
* This child record already has a ATTR_DATA.
* So it can't accept any other records.
@@ -1008,7 +1013,7 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le,
}
if ((type != ATTR_NAME || name_len) &&
- mi_find_attr(mi, NULL, type, name, name_len, NULL)) {
+ mi_find_attr(ni, mi, NULL, type, name, name_len, NULL)) {
/* Only indexed attributes can share same record. */
continue;
}
@@ -1157,7 +1162,7 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
/* Estimate the result of moving all possible attributes away. */
attr = NULL;
- while ((attr = mi_enum_attr(&ni->mi, attr))) {
+ while ((attr = mi_enum_attr(ni, &ni->mi, attr))) {
if (attr->type == ATTR_STD)
continue;
if (attr->type == ATTR_LIST)
@@ -1175,7 +1180,7 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
attr = NULL;
for (;;) {
- attr = mi_enum_attr(&ni->mi, attr);
+ attr = mi_enum_attr(ni, &ni->mi, attr);
if (!attr) {
/* We should never be here 'cause we have already check this case. */
err = -EINVAL;
@@ -1259,7 +1264,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) {
mi = rb_entry(node, struct mft_inode, node);
- attr = mi_enum_attr(mi, NULL);
+ attr = mi_enum_attr(ni, mi, NULL);
if (!attr) {
mft_min = mi->rno;
@@ -1280,7 +1285,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
ni_remove_mi(ni, mi_new);
}
- attr = mi_find_attr(&ni->mi, NULL, ATTR_DATA, NULL, 0, NULL);
+ attr = mi_find_attr(ni, &ni->mi, NULL, ATTR_DATA, NULL, 0, NULL);
if (!attr) {
err = -EINVAL;
goto out;
@@ -1397,7 +1402,7 @@ int ni_expand_list(struct ntfs_inode *ni)
continue;
/* Find attribute in primary record. */
- attr = rec_find_attr_le(&ni->mi, le);
+ attr = rec_find_attr_le(ni, &ni->mi, le);
if (!attr) {
err = -EINVAL;
goto out;
@@ -1604,8 +1609,8 @@ int ni_delete_all(struct ntfs_inode *ni)
roff = le16_to_cpu(attr->nres.run_off);
if (roff > asize) {
- _ntfs_bad_inode(&ni->vfs_inode);
- return -EINVAL;
+ /* ni_enum_attr_ex checks this case. */
+ continue;
}
/* run==1 means unpack and deallocate. */
@@ -2726,9 +2731,10 @@ int ni_write_frame(struct ntfs_inode *ni, struct page **pages,
{
int err;
struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct folio *folio = page_folio(pages[0]);
u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
u32 frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT;
- u64 frame_vbo = (u64)pages[0]->index << PAGE_SHIFT;
+ u64 frame_vbo = folio_pos(folio);
CLST frame = frame_vbo >> frame_bits;
char *frame_ondisk = NULL;
struct page **pages_disk = NULL;
@@ -3343,7 +3349,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (!mi->dirty)
continue;
- is_empty = !mi_enum_attr(mi, NULL);
+ is_empty = !mi_enum_attr(ni, mi, NULL);
if (is_empty)
clear_rec_inuse(mi->mrec);
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 03471bc9371c..938d351ebac7 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -908,7 +908,11 @@ void ntfs_bad_inode(struct inode *inode, const char *hint)
ntfs_inode_err(inode, "%s", hint);
make_bad_inode(inode);
- ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ /* Avoid recursion if bad inode is $Volume. */
+ if (inode->i_ino != MFT_REC_VOL &&
+ !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING)) {
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ }
}
/*
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 9089c58a005c..7eb9fae22f8d 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1094,8 +1094,7 @@ int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn,
ok:
if (!index_buf_check(ib, bytes, &vbn)) {
- ntfs_inode_err(&ni->vfs_inode, "directory corrupted");
- ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+ _ntfs_bad_inode(&ni->vfs_inode);
err = -EINVAL;
goto out;
}
@@ -1117,8 +1116,7 @@ ok:
out:
if (err == -E_NTFS_CORRUPT) {
- ntfs_inode_err(&ni->vfs_inode, "directory corrupted");
- ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+ _ntfs_bad_inode(&ni->vfs_inode);
err = -EINVAL;
}
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index be04d2845bb7..a1e11228dafd 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -410,6 +410,9 @@ end_enum:
if (!std5)
goto out;
+ if (is_bad_inode(inode))
+ goto out;
+
if (!is_match && name) {
err = -ENOENT;
goto out;
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index cd8e8374bb5a..382820464dee 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -745,23 +745,24 @@ int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi);
void mi_put(struct mft_inode *mi);
int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno);
int mi_read(struct mft_inode *mi, bool is_mft);
-struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr);
-// TODO: id?
-struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr,
- enum ATTR_TYPE type, const __le16 *name,
- u8 name_len, const __le16 *id);
-static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec,
+struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+ struct ATTRIB *attr);
+struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+ struct ATTRIB *attr, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, const __le16 *id);
+static inline struct ATTRIB *rec_find_attr_le(struct ntfs_inode *ni,
+ struct mft_inode *rec,
struct ATTR_LIST_ENTRY *le)
{
- return mi_find_attr(rec, NULL, le->type, le_name(le), le->name_len,
+ return mi_find_attr(ni, rec, NULL, le->type, le_name(le), le->name_len,
&le->id);
}
int mi_write(struct mft_inode *mi, int wait);
int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
__le16 flags, bool is_mft);
-struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
- const __le16 *name, u8 name_len, u32 asize,
- u16 name_off);
+struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+ enum ATTR_TYPE type, const __le16 *name,
+ u8 name_len, u32 asize, u16 name_off);
bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
struct ATTRIB *attr);
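
The prototypes above show the new calling convention: the record-walking helpers now also take the owning struct ntfs_inode, so a corrupt record can be reported against the inode instead of silently ending the walk. A minimal, hypothetical caller under the new signatures (the helper name is made up for illustration):

static bool ni_rec_has_attr(struct ntfs_inode *ni, struct mft_inode *mi,
                            enum ATTR_TYPE type)
{
        struct ATTRIB *attr = NULL;

        /* On corruption mi_enum_attr() now calls _ntfs_bad_inode(ni) itself. */
        while ((attr = mi_enum_attr(ni, mi, attr))) {
                if (attr->type == type)
                        return true;
        }
        return false;
}
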
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 61d53d39f3b9..714c7ecedca8 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -31,7 +31,7 @@ static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type,
*
* Return: Unused attribute id that is less than mrec->next_attr_id.
*/
-static __le16 mi_new_attt_id(struct mft_inode *mi)
+static __le16 mi_new_attt_id(struct ntfs_inode *ni, struct mft_inode *mi)
{
u16 free_id, max_id, t16;
struct MFT_REC *rec = mi->mrec;
@@ -52,7 +52,7 @@ static __le16 mi_new_attt_id(struct mft_inode *mi)
attr = NULL;
for (;;) {
- attr = mi_enum_attr(mi, attr);
+ attr = mi_enum_attr(ni, mi, attr);
if (!attr) {
rec->next_attr_id = cpu_to_le16(max_id + 1);
mi->dirty = true;
@@ -195,7 +195,8 @@ out:
* NOTE: mi->mrec - memory of size sbi->record_size
* here we are sure that mi->mrec->total == sbi->record_size (see mi_read)
*/
-struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
+struct ATTRIB *mi_enum_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+ struct ATTRIB *attr)
{
const struct MFT_REC *rec = mi->mrec;
u32 used = le32_to_cpu(rec->used);
@@ -209,11 +210,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
off = le16_to_cpu(rec->attr_off);
if (used > total)
- return NULL;
+ goto out;
if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 ||
!IS_ALIGNED(off, 8)) {
- return NULL;
+ goto out;
}
/* Skip non-resident records. */
@@ -243,7 +244,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
*/
if (off + 8 > used) {
static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8);
- return NULL;
+ goto out;
}
if (attr->type == ATTR_END) {
@@ -254,112 +255,116 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
/* 0x100 is last known attribute for now. */
t32 = le32_to_cpu(attr->type);
if (!t32 || (t32 & 0xf) || (t32 > 0x100))
- return NULL;
+ goto out;
/* attributes in record must be ordered by type */
if (t32 < prev_type)
- return NULL;
+ goto out;
asize = le32_to_cpu(attr->size);
if (!IS_ALIGNED(asize, 8))
- return NULL;
+ goto out;
/* Check overflow and boundary. */
if (off + asize < off || off + asize > used)
- return NULL;
+ goto out;
/* Can we use the field attr->non_res. */
if (off + 9 > used)
- return NULL;
+ goto out;
/* Check size of attribute. */
if (!attr->non_res) {
/* Check resident fields. */
if (asize < SIZEOF_RESIDENT)
- return NULL;
+ goto out;
t16 = le16_to_cpu(attr->res.data_off);
if (t16 > asize)
- return NULL;
+ goto out;
if (le32_to_cpu(attr->res.data_size) > asize - t16)
- return NULL;
+ goto out;
t32 = sizeof(short) * attr->name_len;
if (t32 && le16_to_cpu(attr->name_off) + t32 > t16)
- return NULL;
+ goto out;
return attr;
}
/* Check nonresident fields. */
if (attr->non_res != 1)
- return NULL;
+ goto out;
/* Can we use memory including attr->nres.valid_size? */
if (asize < SIZEOF_NONRESIDENT)
- return NULL;
+ goto out;
t16 = le16_to_cpu(attr->nres.run_off);
if (t16 > asize)
- return NULL;
+ goto out;
t32 = sizeof(short) * attr->name_len;
if (t32 && le16_to_cpu(attr->name_off) + t32 > t16)
- return NULL;
+ goto out;
/* Check start/end vcn. */
if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1)
- return NULL;
+ goto out;
data_size = le64_to_cpu(attr->nres.data_size);
if (le64_to_cpu(attr->nres.valid_size) > data_size)
- return NULL;
+ goto out;
alloc_size = le64_to_cpu(attr->nres.alloc_size);
if (data_size > alloc_size)
- return NULL;
+ goto out;
t32 = mi->sbi->cluster_mask;
if (alloc_size & t32)
- return NULL;
+ goto out;
if (!attr->nres.svcn && is_attr_ext(attr)) {
/* First segment of sparse/compressed attribute */
/* Can we use memory including attr->nres.total_size? */
if (asize < SIZEOF_NONRESIDENT_EX)
- return NULL;
+ goto out;
tot_size = le64_to_cpu(attr->nres.total_size);
if (tot_size & t32)
- return NULL;
+ goto out;
if (tot_size > alloc_size)
- return NULL;
+ goto out;
} else {
if (attr->nres.c_unit)
- return NULL;
+ goto out;
if (alloc_size > mi->sbi->volume.size)
- return NULL;
+ goto out;
}
return attr;
+
+out:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ return NULL;
}
/*
* mi_find_attr - Find the attribute by type and name and id.
*/
-struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr,
- enum ATTR_TYPE type, const __le16 *name,
- u8 name_len, const __le16 *id)
+struct ATTRIB *mi_find_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+ struct ATTRIB *attr, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, const __le16 *id)
{
u32 type_in = le32_to_cpu(type);
u32 atype;
next_attr:
- attr = mi_enum_attr(mi, attr);
+ attr = mi_enum_attr(ni, mi, attr);
if (!attr)
return NULL;
@@ -467,9 +472,9 @@ int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
*
* Return: Not full constructed attribute or NULL if not possible to create.
*/
-struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
- const __le16 *name, u8 name_len, u32 asize,
- u16 name_off)
+struct ATTRIB *mi_insert_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+ enum ATTR_TYPE type, const __le16 *name,
+ u8 name_len, u32 asize, u16 name_off)
{
size_t tail;
struct ATTRIB *attr;
@@ -488,7 +493,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
* at which we should insert it.
*/
attr = NULL;
- while ((attr = mi_enum_attr(mi, attr))) {
+ while ((attr = mi_enum_attr(ni, mi, attr))) {
int diff = compare_attr(attr, type, name, name_len, upcase);
if (diff < 0)
@@ -508,7 +513,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
tail = used - PtrOffset(rec, attr);
}
- id = mi_new_attt_id(mi);
+ id = mi_new_attt_id(ni, mi);
memmove(Add2Ptr(attr, asize), attr, tail);
memset(attr, 0, asize);
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index a9b8688aaf30..1873bbbb7e5b 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -32,7 +32,8 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry)
}
-static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+static int ocfs2_dentry_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
int ret = 0; /* if all else fails, just return false */
@@ -44,8 +45,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
inode = d_inode(dentry);
osb = OCFS2_SB(dentry->d_sb);
- trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
- dentry->d_name.name);
+ trace_ocfs2_dentry_revalidate(dentry, name->len, name->name);
/* For a negative dentry -
* check the generation number of the parent and compare with the
@@ -53,12 +53,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
*/
if (inode == NULL) {
unsigned long gen = (unsigned long) dentry->d_fsdata;
- unsigned long pgen;
- spin_lock(&dentry->d_lock);
- pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen;
- spin_unlock(&dentry->d_lock);
- trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
- dentry->d_name.name,
+ unsigned long pgen = OCFS2_I(dir)->ip_dir_lock_gen;
+ trace_ocfs2_dentry_revalidate_negative(name->len, name->name,
pgen, gen);
if (gen != pgen)
goto bail;
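
This hunk shows the new ->d_revalidate() calling convention used throughout this series: the VFS passes the parent directory inode and the looked-up name to the hook, so filesystems no longer need dget_parent() or d_lock to inspect them (which matters under RCU walk). A bare-bones sketch of the shape, not ocfs2's actual logic:

static int example_d_revalidate(struct inode *dir, const struct qstr *name,
                                struct dentry *dentry, unsigned int flags)
{
        if (flags & LOOKUP_RCU)
                return -ECHILD;         /* punt to ref-walk if we must sleep */

        /* validate the dentry against dir/name here ... */
        return 1;                       /* 1 = still valid, 0 = invalidate */
}

static const struct dentry_operations example_dentry_ops = {
        .d_revalidate   = example_d_revalidate,
};
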
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 20aa37b67cfb..ddd761cf44c8 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -650,7 +650,7 @@ error:
* and easier to preserve the name.
*/
-static struct ctl_table ocfs2_nm_table[] = {
+static const struct ctl_table ocfs2_nm_table[] = {
{
.procname = "hb_ctl_path",
.data = ocfs2_hb_ctl_path,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e0b91dbaa0ac..8bb5022f3082 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2285,7 +2285,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
mlog(ML_ERROR, "found superblock with incorrect block "
"size bits: found %u, should be 9, 10, 11, or 12\n",
blksz_bits);
- } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) {
+ } else if ((1 << blksz_bits) != blksz) {
mlog(ML_ERROR, "found superblock with incorrect block "
"size: found %u, should be %u\n", 1 << blksz_bits, blksz);
} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
diff --git a/fs/open.c b/fs/open.c
index 932e5a6de63b..1be20de9f283 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -905,7 +905,8 @@ static int do_dentry_open(struct file *f,
f->f_sb_err = file_sample_sb_err(f);
if (unlikely(f->f_flags & O_PATH)) {
- f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY;
+ f->f_mode = FMODE_PATH | FMODE_OPENED;
+ file_set_fsnotify_mode(f, FMODE_NONOTIFY);
f->f_op = &empty_fops;
return 0;
}
@@ -935,10 +936,10 @@ static int do_dentry_open(struct file *f,
/*
* Set FMODE_NONOTIFY_* bits according to existing permission watches.
- * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't
- * change anything.
+ * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a
+ * pseudo file, this call will not change the mode.
*/
- file_set_fsnotify_mode(f);
+ file_set_fsnotify_mode_from_watchers(f);
error = fsnotify_open_perm(f);
if (error)
goto cleanup_all;
@@ -1122,7 +1123,7 @@ struct file *dentry_open_nonotify(const struct path *path, int flags,
if (!IS_ERR(f)) {
int error;
- f->f_mode |= FMODE_NONOTIFY;
+ file_set_fsnotify_mode(f, FMODE_NONOTIFY);
error = vfs_open(path, f);
if (error) {
fput(f);
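
The open.c hunks above separate two cases: forcing notifications off entirely (file_set_fsnotify_mode() with FMODE_NONOTIFY, used for O_PATH and internal opens) versus deriving the notify mode from existing watchers (file_set_fsnotify_mode_from_watchers()). As a rough sketch of what the setter amounts to; the mask name below is an assumption, the real helper is defined by the fsnotify headers:

static inline void file_set_fsnotify_mode_sketch(struct file *file,
                                                 fmode_t mode)
{
        /* FMODE_FSNOTIFY_MASK is assumed to cover the FMODE_NONOTIFY* bits */
        file->f_mode &= ~FMODE_FSNOTIFY_MASK;
        file->f_mode |= mode;
}
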
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index 395a00ed8ac7..a19d1ad705db 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -13,10 +13,9 @@
#include "orangefs-kernel.h"
/* Returns 1 if dentry can still be trusted, else 0. */
-static int orangefs_revalidate_lookup(struct dentry *dentry)
+static int orangefs_revalidate_lookup(struct inode *parent_inode, const struct qstr *name,
+ struct dentry *dentry)
{
- struct dentry *parent_dentry = dget_parent(dentry);
- struct inode *parent_inode = parent_dentry->d_inode;
struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode);
struct inode *inode = dentry->d_inode;
struct orangefs_kernel_op_s *new_op;
@@ -26,14 +25,14 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
- if (!new_op) {
- ret = -ENOMEM;
- goto out_put_parent;
- }
+ if (!new_op)
+ return -ENOMEM;
new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
new_op->upcall.req.lookup.parent_refn = parent->refn;
- strscpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name);
+ /* op_alloc() leaves ->upcall zeroed */
+ memcpy(new_op->upcall.req.lookup.d_name, name->name,
+ min(name->len, ORANGEFS_NAME_MAX - 1));
gossip_debug(GOSSIP_DCACHE_DEBUG,
"%s:%s:%d interrupt flag [%d]\n",
@@ -78,8 +77,6 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
ret = 1;
out_release_op:
op_release(new_op);
-out_put_parent:
- dput(parent_dentry);
return ret;
out_drop:
gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n",
@@ -92,7 +89,8 @@ out_drop:
*
* Should return 1 if dentry can still be trusted, else 0.
*/
-static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int orangefs_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
int ret;
unsigned long time = (unsigned long) dentry->d_fsdata;
@@ -114,7 +112,7 @@ static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
* If this passes, the positive dentry still exists or the negative
* dentry still does not exist.
*/
- if (!orangefs_revalidate_lookup(dentry))
+ if (!orangefs_revalidate_lookup(dir, name, dentry))
return 0;
/* We do not need to continue with negative dentries. */
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 9729f071c5aa..f52073022fae 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -392,9 +392,9 @@ static ssize_t orangefs_debug_write(struct file *file,
* Thwart users who try to jam a ridiculous number
* of bytes into the debug file...
*/
- if (count > ORANGEFS_MAX_DEBUG_STRING_LEN + 1) {
+ if (count > ORANGEFS_MAX_DEBUG_STRING_LEN) {
silly = count;
- count = ORANGEFS_MAX_DEBUG_STRING_LEN + 1;
+ count = ORANGEFS_MAX_DEBUG_STRING_LEN;
}
buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index cea820cb3b55..be5c65d6f848 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -14,8 +14,6 @@
#include <linux/exportfs.h>
#include "overlayfs.h"
-#include "../internal.h" /* for vfs_path_lookup */
-
struct ovl_lookup_data {
struct super_block *sb;
const struct ovl_layer *layer;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index fe511192f83c..86ae6f6da36b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -91,7 +91,24 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE)
ret = d->d_op->d_weak_revalidate(d, flags);
} else if (d->d_flags & DCACHE_OP_REVALIDATE) {
- ret = d->d_op->d_revalidate(d, flags);
+ struct dentry *parent;
+ struct inode *dir;
+ struct name_snapshot n;
+
+ if (flags & LOOKUP_RCU) {
+ parent = READ_ONCE(d->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(d);
+ dir = d_inode(parent);
+ }
+ take_dentry_name_snapshot(&n, d);
+ ret = d->d_op->d_revalidate(dir, &n.name, d, flags);
+ release_dentry_name_snapshot(&n);
+ if (!(flags & LOOKUP_RCU))
+ dput(parent);
if (!ret) {
if (!(flags & LOOKUP_RCU))
d_invalidate(d);
@@ -127,7 +144,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
return ret;
}
-static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+static int ovl_dentry_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
return ovl_dentry_revalidate_common(dentry, flags, false);
}
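
The overlayfs hunk above illustrates the pattern any caller of ->d_revalidate() outside the normal lookup path now follows: pin (or RCU-read) the parent to obtain the directory inode, and snapshot the name so a concurrent rename cannot free it mid-call. Reduced to just the ref-walk case as a sketch:

        struct dentry *parent = dget_parent(d);
        struct name_snapshot n;

        take_dentry_name_snapshot(&n, d);       /* stable copy of d->d_name */
        ret = d->d_op->d_revalidate(d_inode(parent), &n.name, d, flags);
        release_dentry_name_snapshot(&n);
        dput(parent);
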
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 049352f973de..63f9699ebac3 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -287,7 +287,6 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
switch (cmd) {
case FS_IOC_GETVERSION:
case PIDFD_GET_CGROUP_NAMESPACE:
- case PIDFD_GET_INFO:
case PIDFD_GET_IPC_NAMESPACE:
case PIDFD_GET_MNT_NAMESPACE:
case PIDFD_GET_NET_NAMESPACE:
@@ -300,6 +299,17 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
return true;
}
+ /* Extensible ioctls require some more careful checks. */
+ switch (_IOC_NR(cmd)) {
+ case _IOC_NR(PIDFD_GET_INFO):
+ /*
+ * Try to prevent performing a pidfd ioctl when someone
+ * erroneously mistook the file descriptor for a pidfd.
+ * This is not perfect but will catch most cases.
+ */
+ return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO));
+ }
+
return false;
}
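
PIDFD_GET_INFO is an extensible ioctl: userspace encodes the size of its struct pidfd_info in the command word, so a newer, larger struct produces a different cmd value. Matching only the type and number therefore accepts all size variants. A sketch of that check, for illustration:

static bool is_pidfd_info_cmd(unsigned int cmd)
{
        /* ignore the size bits so future, larger struct versions still match */
        return _IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO) &&
               _IOC_NR(cmd)  == _IOC_NR(PIDFD_GET_INFO);
}
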
diff --git a/fs/pipe.c b/fs/pipe.c
index 82fede0f2111..ce1af7592780 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -960,6 +960,12 @@ int create_pipe_files(struct file **res, int flags)
res[1] = f;
stream_open(inode, res[0]);
stream_open(inode, res[1]);
+ /*
+ * Disable permission and pre-content events, but enable legacy
+ * inotify events for legacy users.
+ */
+ file_set_fsnotify_mode(res[0], FMODE_NONOTIFY_PERM);
+ file_set_fsnotify_mode(res[1], FMODE_NONOTIFY_PERM);
return 0;
}
@@ -1478,7 +1484,7 @@ static int proc_dopipe_max_size(const struct ctl_table *table, int write,
do_proc_dopipe_max_size_conv, NULL);
}
-static struct ctl_table fs_pipe_sysctls[] = {
+static const struct ctl_table fs_pipe_sysctls[] = {
{
.procname = "pipe-max-size",
.data = &pipe_max_size,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a50b222a5917..cd89e956c322 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2058,7 +2058,8 @@ void pid_update_inode(struct task_struct *task, struct inode *inode)
* performed a setuid(), etc.
*
*/
-static int pid_revalidate(struct dentry *dentry, unsigned int flags)
+static int pid_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
@@ -2191,7 +2192,8 @@ static int dname_to_vma_addr(struct dentry *dentry,
return 0;
}
-static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int map_files_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
unsigned long vm_start, vm_end;
bool exact_vma_exists = false;
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 24baf23e864f..37aa778d1af7 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -140,7 +140,8 @@ static void tid_fd_update_inode(struct task_struct *task, struct inode *inode,
security_task_to_inode(task, inode);
}
-static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
+static int tid_fd_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct task_struct *task;
struct inode *inode;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index dbe82cf23ee4..8ec90826a49e 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -216,7 +216,8 @@ void proc_free_inum(unsigned int inum)
ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
}
-static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -343,7 +344,8 @@ static const struct file_operations proc_dir_operations = {
.iterate_shared = proc_readdir,
};
-static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
return 0;
}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 27a283d85a6e..cc9d74a06ff0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -884,7 +884,8 @@ static const struct inode_operations proc_sys_dir_operations = {
.getattr = proc_sys_getattr,
};
-static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
+static int proc_sys_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a00120a3c099..10d01eb09c43 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1524,7 +1524,7 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
pr_warn_once("Unexpected adding of device dump\n");
if (vmcore_open) {
ret = -EBUSY;
- goto out_err;
+ goto unlock;
}
list_add_tail(&dump->list, &vmcoredd_list);
@@ -1532,6 +1532,9 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
mutex_unlock(&vmcore_mutex);
return 0;
+unlock:
+ mutex_unlock(&vmcore_mutex);
+
out_err:
vfree(buf);
vfree(dump);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index f9578918cfb2..825c5c2e0962 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2926,7 +2926,7 @@ static int do_proc_dqstats(const struct ctl_table *table, int write,
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
-static struct ctl_table fs_dqstats_table[] = {
+static const struct ctl_table fs_dqstats_table[] = {
{
.procname = "lookups",
.data = &dqstats.stat[DQST_LOOKUPS],
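
Several sysctl tables in this series (ocfs2, pipe, quota) become const; the registration helpers take pointers to const tables, so a read-only table can live in rodata. A minimal sketch with made-up names, assuming the current register_sysctl() interface:

static int example_knob;

static const struct ctl_table example_sysctls[] = {
        {
                .procname       = "example-knob",
                .data           = &example_knob,
                .maxlen         = sizeof(example_knob),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
};

static int __init example_sysctl_init(void)
{
        register_sysctl("fs", example_sysctls); /* accepts const tables */
        return 0;
}
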
diff --git a/fs/smb/client/asn1.c b/fs/smb/client/asn1.c
index b5724ef9f182..214a44509e7b 100644
--- a/fs/smb/client/asn1.c
+++ b/fs/smb/client/asn1.c
@@ -52,6 +52,8 @@ int cifs_neg_token_init_mech_type(void *context, size_t hdrlen,
server->sec_kerberos = true;
else if (oid == OID_ntlmssp)
server->sec_ntlmssp = true;
+ else if (oid == OID_IAKerb)
+ server->sec_iakerb = true;
else {
char buf[50];
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 28f568b5fc27..bc1c1e9b288a 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -138,11 +138,13 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo,
dp = description + strlen(description);
- /* for now, only sec=krb5 and sec=mskrb5 are valid */
+ /* for now, only sec=krb5, sec=mskrb5 and sec=iakerb are valid */
if (server->sec_kerberos)
sprintf(dp, ";sec=krb5");
else if (server->sec_mskerberos)
sprintf(dp, ";sec=mskrb5");
+ else if (server->sec_iakerb)
+ sprintf(dp, ";sec=iakerb");
else {
cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n");
sprintf(dp, ";sec=krb5");
diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c
index ba79aa2107cc..699a3f76d083 100644
--- a/fs/smb/client/cifsacl.c
+++ b/fs/smb/client/cifsacl.c
@@ -1395,7 +1395,7 @@ chown_chgrp_exit:
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen,
- u32 __maybe_unused unused)
+ u32 info)
{
struct smb_ntsd *pntsd = NULL;
unsigned int xid;
@@ -1407,7 +1407,7 @@ struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
xid = get_xid();
rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd,
- pacllen);
+ pacllen, info);
free_xid(xid);
cifs_put_tlink(tlink);
@@ -1419,7 +1419,7 @@ struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
}
static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
- const char *path, u32 *pacllen)
+ const char *path, u32 *pacllen, u32 info)
{
struct smb_ntsd *pntsd = NULL;
int oplock = 0;
@@ -1446,9 +1446,12 @@ static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
.fid = &fid,
};
+ if (info & SACL_SECINFO)
+ oparms.desired_access |= SYSTEM_SECURITY;
+
rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (!rc) {
- rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen);
+ rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen, info);
CIFSSMBClose(xid, tcon, fid.netfid);
}
@@ -1472,7 +1475,7 @@ struct smb_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
if (inode)
open_file = find_readable_file(CIFS_I(inode), true);
if (!open_file)
- return get_cifs_acl_by_path(cifs_sb, path, pacllen);
+ return get_cifs_acl_by_path(cifs_sb, path, pacllen, info);
pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen, info);
cifsFileInfo_put(open_file);
@@ -1485,7 +1488,7 @@ int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen,
{
int oplock = 0;
unsigned int xid;
- int rc, access_flags;
+ int rc, access_flags = 0;
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -1498,10 +1501,12 @@ int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen,
tcon = tlink_tcon(tlink);
xid = get_xid();
- if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
- access_flags = WRITE_OWNER;
- else
- access_flags = WRITE_DAC;
+ if (aclflag & CIFS_ACL_OWNER || aclflag & CIFS_ACL_GROUP)
+ access_flags |= WRITE_OWNER;
+ if (aclflag & CIFS_ACL_SACL)
+ access_flags |= SYSTEM_SECURITY;
+ if (aclflag & CIFS_ACL_DACL)
+ access_flags |= WRITE_DAC;
oparms = (struct cifs_open_parms) {
.tcon = tcon,
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index b800c9f585d8..6a3bd652d251 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -715,6 +715,12 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
cifs_sb->ctx->backupgid));
seq_show_option(s, "reparse",
cifs_reparse_type_str(cifs_sb->ctx->reparse_type));
+ if (cifs_sb->ctx->nonativesocket)
+ seq_puts(s, ",nonativesocket");
+ else
+ seq_puts(s, ",nativesocket");
+ seq_show_option(s, "symlink",
+ cifs_symlink_type_str(get_cifs_symlink_type(cifs_sb)));
seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index a762dbbbd959..831fee962c4d 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -146,6 +146,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 52
-#define CIFS_VERSION "2.52"
+#define SMB3_PRODUCT_BUILD 53
+#define CIFS_VERSION "2.53"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 49ffc040f736..cddeb2adbf4a 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -151,6 +151,7 @@ enum securityEnum {
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
Kerberos, /* Kerberos via SPNEGO */
+ IAKerb, /* Kerberos proxy */
};
enum upcall_target_enum {
@@ -160,6 +161,7 @@ enum upcall_target_enum {
};
enum cifs_reparse_type {
+ CIFS_REPARSE_TYPE_NONE,
CIFS_REPARSE_TYPE_NFS,
CIFS_REPARSE_TYPE_WSL,
CIFS_REPARSE_TYPE_DEFAULT = CIFS_REPARSE_TYPE_NFS,
@@ -168,6 +170,8 @@ enum cifs_reparse_type {
static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type)
{
switch (type) {
+ case CIFS_REPARSE_TYPE_NONE:
+ return "none";
case CIFS_REPARSE_TYPE_NFS:
return "nfs";
case CIFS_REPARSE_TYPE_WSL:
@@ -177,6 +181,39 @@ static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type)
}
}
+enum cifs_symlink_type {
+ CIFS_SYMLINK_TYPE_DEFAULT,
+ CIFS_SYMLINK_TYPE_NONE,
+ CIFS_SYMLINK_TYPE_NATIVE,
+ CIFS_SYMLINK_TYPE_UNIX,
+ CIFS_SYMLINK_TYPE_MFSYMLINKS,
+ CIFS_SYMLINK_TYPE_SFU,
+ CIFS_SYMLINK_TYPE_NFS,
+ CIFS_SYMLINK_TYPE_WSL,
+};
+
+static inline const char *cifs_symlink_type_str(enum cifs_symlink_type type)
+{
+ switch (type) {
+ case CIFS_SYMLINK_TYPE_NONE:
+ return "none";
+ case CIFS_SYMLINK_TYPE_NATIVE:
+ return "native";
+ case CIFS_SYMLINK_TYPE_UNIX:
+ return "unix";
+ case CIFS_SYMLINK_TYPE_MFSYMLINKS:
+ return "mfsymlinks";
+ case CIFS_SYMLINK_TYPE_SFU:
+ return "sfu";
+ case CIFS_SYMLINK_TYPE_NFS:
+ return "nfs";
+ case CIFS_SYMLINK_TYPE_WSL:
+ return "wsl";
+ default:
+ return "unknown";
+ }
+}
+
struct session_key {
unsigned int len;
char *response;
@@ -215,10 +252,8 @@ struct cifs_cred {
struct cifs_open_info_data {
bool adjust_tz;
- union {
- bool reparse_point;
- bool symlink;
- };
+ bool reparse_point;
+ bool contains_posix_file_info;
struct {
/* ioctl response buffer */
struct {
@@ -226,10 +261,7 @@ struct cifs_open_info_data {
struct kvec iov;
} io;
__u32 tag;
- union {
- struct reparse_data_buffer *buf;
- struct reparse_posix_data *posix;
- };
+ struct reparse_data_buffer *buf;
} reparse;
struct {
__u8 eas[SMB2_WSL_MAX_QUERY_EA_RESP_SIZE];
@@ -326,7 +358,7 @@ struct smb_version_operations {
int (*handle_cancelled_mid)(struct mid_q_entry *, struct TCP_Server_Info *);
void (*downgrade_oplock)(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache);
+ __u16 epoch, bool *purge_cache);
/* process transaction2 response */
bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
char *, int);
@@ -521,12 +553,12 @@ struct smb_version_operations {
/* if we can do cache read operations */
bool (*is_read_op)(__u32);
/* set oplock level for the inode */
- void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int,
- bool *);
+ void (*set_oplock_level)(struct cifsInodeInfo *cinode, __u32 oplock, __u16 epoch,
+ bool *purge_cache);
/* create lease context buffer for CREATE request */
char * (*create_lease_buf)(u8 *lease_key, u8 oplock);
/* parse lease context buffer and return oplock/epoch info */
- __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey);
+ __u8 (*parse_lease_buf)(void *buf, __u16 *epoch, char *lkey);
ssize_t (*copychunk_range)(const unsigned int,
struct cifsFileInfo *src_file,
struct cifsFileInfo *target_file,
@@ -751,6 +783,7 @@ struct TCP_Server_Info {
bool sec_kerberosu2u; /* supports U2U Kerberos */
bool sec_kerberos; /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
+ bool sec_iakerb; /* supports pass-through auth for Kerberos (krb5 proxy) */
bool large_buf; /* is current buffer large? */
/* use SMBD connection instead of socket */
bool rdma;
@@ -1415,7 +1448,7 @@ struct cifs_fid {
__u8 create_guid[16];
__u32 access;
struct cifs_pending_open *pending_open;
- unsigned int epoch;
+ __u16 epoch;
#ifdef CONFIG_CIFS_DEBUG2
__u64 mid;
#endif /* CIFS_DEBUG2 */
@@ -1448,7 +1481,7 @@ struct cifsFileInfo {
bool oplock_break_cancelled:1;
bool status_file_deleted:1; /* file has been deleted */
bool offload:1; /* offload final part of _put to a wq */
- unsigned int oplock_epoch; /* epoch from the lease break */
+ __u16 oplock_epoch; /* epoch from the lease break */
__u32 oplock_level; /* oplock/lease level from the lease break */
int count;
spinlock_t file_info_lock; /* protects four flag/count fields above */
@@ -1476,7 +1509,6 @@ struct cifs_io_parms {
struct cifs_io_request {
struct netfs_io_request rreq;
struct cifsFileInfo *cfile;
- struct TCP_Server_Info *server;
pid_t pid;
};
@@ -1545,7 +1577,7 @@ struct cifsInodeInfo {
spinlock_t open_file_lock; /* protects openFileList */
__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
unsigned int oplock; /* oplock/lease level we have */
- unsigned int epoch; /* used to track lease state changes */
+ __u16 epoch; /* used to track lease state changes */
#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */
#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */
#define CIFS_INODE_FLAG_UNUSED (2) /* Unused flag */
@@ -2118,6 +2150,8 @@ static inline char *get_security_type_str(enum securityEnum sectype)
return "Kerberos";
case NTLMv2:
return "NTLMv2";
+ case IAKerb:
+ return "IAKerb";
default:
return "Unknown";
}
@@ -2173,11 +2207,13 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses)
static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src)
{
- memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src));
- dst->AccessFlags = src->AccessFlags;
- dst->CurrentByteOffset = src->CurrentByteOffset;
- dst->Mode = src->Mode;
- dst->AlignmentRequirement = src->AlignmentRequirement;
+ memcpy(dst, src, (size_t)((u8 *)&src->EASize - (u8 *)src));
+ dst->IndexNumber = 0;
+ dst->EASize = src->EASize;
+ dst->AccessFlags = 0;
+ dst->CurrentByteOffset = 0;
+ dst->Mode = 0;
+ dst->AlignmentRequirement = 0;
dst->FileNameLength = src->FileNameLength;
}
@@ -2289,8 +2325,8 @@ struct smb2_compound_vars {
struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
struct kvec close_iov;
- struct smb2_file_rename_info rename_info;
- struct smb2_file_link_info link_info;
+ struct smb2_file_rename_info_hdr rename_info;
+ struct smb2_file_link_info_hdr link_info;
struct kvec ea_iov;
};
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index 5c047b00516f..48d0d6f439cf 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -190,42 +190,82 @@
*/
#define FILE_READ_DATA 0x00000001 /* Data can be read from the file */
+ /* or directory child entries can */
+ /* be listed together with the */
+ /* associated child attributes */
+ /* (so the FILE_READ_ATTRIBUTES on */
+ /* the child entry is not needed) */
#define FILE_WRITE_DATA 0x00000002 /* Data can be written to the file */
+ /* or new file can be created in */
+ /* the directory */
#define FILE_APPEND_DATA 0x00000004 /* Data can be appended to the file */
+ /* (for non-local files over SMB it */
+ /* is same as FILE_WRITE_DATA) */
+ /* or new subdirectory can be */
+ /* created in the directory */
#define FILE_READ_EA 0x00000008 /* Extended attributes associated */
/* with the file can be read */
#define FILE_WRITE_EA 0x00000010 /* Extended attributes associated */
/* with the file can be written */
#define FILE_EXECUTE 0x00000020 /*Data can be read into memory from */
/* the file using system paging I/O */
-#define FILE_DELETE_CHILD 0x00000040
+ /* for executing the file / script */
+ /* or right to traverse directory */
+ /* (but by default all users have */
+ /* directory bypass traverse */
+ /* privilege and do not need this */
+ /* permission on directories at all)*/
+#define FILE_DELETE_CHILD 0x00000040 /* Child entry can be deleted from */
+ /* the directory (so the DELETE on */
+ /* the child entry is not needed) */
#define FILE_READ_ATTRIBUTES 0x00000080 /* Attributes associated with the */
- /* file can be read */
+ /* file or directory can be read */
#define FILE_WRITE_ATTRIBUTES 0x00000100 /* Attributes associated with the */
- /* file can be written */
-#define DELETE 0x00010000 /* The file can be deleted */
-#define READ_CONTROL 0x00020000 /* The access control list and */
- /* ownership associated with the */
- /* file can be read */
-#define WRITE_DAC 0x00040000 /* The access control list and */
- /* ownership associated with the */
- /* file can be written. */
+ /* file or directory can be written */
+#define DELETE 0x00010000 /* The file or dir can be deleted */
+#define READ_CONTROL 0x00020000 /* The discretionary access control */
+ /* list and ownership associated */
+ /* with the file or dir can be read */
+#define WRITE_DAC 0x00040000 /* The discretionary access control */
+ /* list associated with the file or */
+ /* directory can be written */
#define WRITE_OWNER 0x00080000 /* Ownership information associated */
- /* with the file can be written */
+ /* with the file/dir can be written */
#define SYNCHRONIZE 0x00100000 /* The file handle can waited on to */
/* synchronize with the completion */
/* of an input/output request */
#define SYSTEM_SECURITY 0x01000000 /* The system access control list */
- /* can be read and changed */
-#define GENERIC_ALL 0x10000000
-#define GENERIC_EXECUTE 0x20000000
-#define GENERIC_WRITE 0x40000000
-#define GENERIC_READ 0x80000000
- /* In summary - Relevant file */
- /* access flags from CIFS are */
- /* file_read_data, file_write_data */
- /* file_execute, file_read_attributes*/
- /* write_dac, and delete. */
+ /* associated with the file or */
+ /* directory can be read or written */
+ /* (cannot be in DACL, can be in SACL) */
+#define MAXIMUM_ALLOWED 0x02000000 /* Maximal subset of GENERIC_ALL */
+ /* permissions which can be granted */
+ /* (cannot be in DACL nor SACL) */
+#define GENERIC_ALL 0x10000000 /* Same as: GENERIC_EXECUTE | */
+ /* GENERIC_WRITE | */
+ /* GENERIC_READ | */
+ /* FILE_DELETE_CHILD | */
+ /* DELETE | */
+ /* WRITE_DAC | */
+ /* WRITE_OWNER */
+ /* So GENERIC_ALL contains all bits */
+ /* mentioned above except these two: */
+ /* SYSTEM_SECURITY, MAXIMUM_ALLOWED */
+#define GENERIC_EXECUTE 0x20000000 /* Same as: FILE_EXECUTE | */
+ /* FILE_READ_ATTRIBUTES | */
+ /* READ_CONTROL | */
+ /* SYNCHRONIZE */
+#define GENERIC_WRITE 0x40000000 /* Same as: FILE_WRITE_DATA | */
+ /* FILE_APPEND_DATA | */
+ /* FILE_WRITE_EA | */
+ /* FILE_WRITE_ATTRIBUTES | */
+ /* READ_CONTROL | */
+ /* SYNCHRONIZE */
+#define GENERIC_READ 0x80000000 /* Same as: FILE_READ_DATA | */
+ /* FILE_READ_EA | */
+ /* FILE_READ_ATTRIBUTES | */
+ /* READ_CONTROL | */
+ /* SYNCHRONIZE */
#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES)
#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
@@ -1484,20 +1524,6 @@ struct file_notify_information {
__u8 FileName[];
} __attribute__((packed));
-/* For IO_REPARSE_TAG_NFS */
-#define NFS_SPECFILE_LNK 0x00000000014B4E4C
-#define NFS_SPECFILE_CHR 0x0000000000524843
-#define NFS_SPECFILE_BLK 0x00000000004B4C42
-#define NFS_SPECFILE_FIFO 0x000000004F464946
-#define NFS_SPECFILE_SOCK 0x000000004B434F53
-struct reparse_posix_data {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __le64 InodeType; /* LNK, FIFO, CHR etc. */
- __u8 DataBuffer[];
-} __attribute__((packed));
-
struct cifs_quota_data {
__u32 rsrvd1; /* 0 */
__u32 sid_size;
@@ -2264,13 +2290,7 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */
__u8 DeletePending;
__u8 Directory;
__u16 Pad2;
- __le64 IndexNumber;
__le32 EASize;
- __le32 AccessFlags;
- __u64 IndexNumber1;
- __le64 CurrentByteOffset;
- __le32 Mode;
- __le32 AlignmentRequirement;
__le32 FileNameLength;
union {
char __pad;
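
The expanded comments above spell out which specific rights each GENERIC_* bit stands for. Purely as an illustrative helper (not part of the patch), that mapping could be written as:

static u32 generic_to_specific_rights(u32 access)
{
        /* keep the specific bits, drop the generic ones expanded below */
        u32 rights = access & ~(GENERIC_READ | GENERIC_WRITE | GENERIC_EXECUTE);

        if (access & GENERIC_READ)
                rights |= FILE_READ_DATA | FILE_READ_EA |
                          FILE_READ_ATTRIBUTES | READ_CONTROL | SYNCHRONIZE;
        if (access & GENERIC_WRITE)
                rights |= FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA |
                          FILE_WRITE_ATTRIBUTES | READ_CONTROL | SYNCHRONIZE;
        if (access & GENERIC_EXECUTE)
                rights |= FILE_EXECUTE | FILE_READ_ATTRIBUTES |
                          READ_CONTROL | SYNCHRONIZE;
        /* GENERIC_ALL (all of the above plus DELETE, FILE_DELETE_CHILD,
         * WRITE_DAC and WRITE_OWNER) is left out to keep the sketch short. */
        return rights;
}
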
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 223e5e231f42..81680001944d 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -557,7 +557,7 @@ extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
const struct nls_table *nls_codepage,
struct cifs_sb_info *cifs_sb);
extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon,
- __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen);
+ __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen, __u32 info);
extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16,
struct smb_ntsd *pntsd, __u32 len, int aclflag);
extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
@@ -656,7 +656,7 @@ char *extract_sharename(const char *unc);
int parse_reparse_point(struct reparse_data_buffer *buf,
u32 plen, struct cifs_sb_info *cifs_sb,
const char *full_path,
- bool unicode, struct cifs_open_info_data *data);
+ struct cifs_open_info_data *data);
int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev,
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 7f1cacc89dbb..3feaa0f68169 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -3369,7 +3369,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
/* Get Security Descriptor (by handle) from remote server for a file or dir */
int
CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
- struct smb_ntsd **acl_inf, __u32 *pbuflen)
+ struct smb_ntsd **acl_inf, __u32 *pbuflen, __u32 info)
{
int rc = 0;
int buf_type = 0;
@@ -3392,7 +3392,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
pSMB->MaxSetupCount = 0;
pSMB->Fid = fid; /* file handle always le */
pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP |
- CIFS_ACL_DACL);
+ CIFS_ACL_DACL | info);
pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */
inc_rfc1001_len(pSMB, 11);
iov[0].iov_base = (char *)pSMB;
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 880d7cf8b730..f917de020dd5 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -2849,6 +2849,10 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
return 0;
if (old->ctx->reparse_type != new->ctx->reparse_type)
return 0;
+ if (old->ctx->nonativesocket != new->ctx->nonativesocket)
+ return 0;
+ if (old->ctx->symlink_type != new->ctx->symlink_type)
+ return 0;
return 1;
}
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index dad521336b5e..f65a8a90ba27 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -150,25 +150,27 @@ again:
if (rc)
continue;
- if (tgt.flags & DFSREF_STORAGE_SERVER) {
- rc = cifs_mount_get_tcon(mnt_ctx);
- if (!rc)
- rc = cifs_is_path_remote(mnt_ctx);
+ rc = cifs_mount_get_tcon(mnt_ctx);
+ if (rc) {
+ if (tgt.server_type == DFS_TYPE_LINK &&
+ DFS_INTERLINK(tgt.flags))
+ rc = -EREMOTE;
+ } else {
+ rc = cifs_is_path_remote(mnt_ctx);
if (!rc) {
ref_walk_set_tgt_hint(rw);
break;
}
- if (rc != -EREMOTE)
- continue;
}
-
- rc = ref_walk_advance(rw);
- if (!rc) {
- rc = setup_dfs_ref(&tgt, rw);
- if (rc)
- break;
- ref_walk_mark_end(rw);
- goto again;
+ if (rc == -EREMOTE) {
+ rc = ref_walk_advance(rw);
+ if (!rc) {
+ rc = setup_dfs_ref(&tgt, rw);
+ if (rc)
+ break;
+ ref_walk_mark_end(rw);
+ goto again;
+ }
}
}
} while (rc && ref_walk_descend(rw));
diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h
index ed4cd7cf1ec6..e60f0a24a8a1 100644
--- a/fs/smb/client/dfs.h
+++ b/fs/smb/client/dfs.h
@@ -188,4 +188,11 @@ static inline void dfs_put_root_smb_sessions(struct list_head *head)
}
}
+static inline const char *dfs_ses_refpath(struct cifs_ses *ses)
+{
+ const char *path = ses->server->leaf_fullpath;
+
+ return path ? path + 1 : ERR_PTR(-ENOENT);
+}
+
#endif /* _CIFS_DFS_H */
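
Unlike the kstrdup()-based helper it replaces (see the dfs_cache.c hunks below), dfs_ses_refpath() returns a pointer that borrows ses->server->leaf_fullpath, or an ERR_PTR when there is none, so callers must not kfree() it. The expected usage shape:

        const char *path = dfs_ses_refpath(ses);

        if (IS_ERR(path))
                return PTR_ERR(path);
        /* ... use path for the lifetime of the session; no kfree() ... */
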
diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c
index 5022bb1f122a..4dada26d56b5 100644
--- a/fs/smb/client/dfs_cache.c
+++ b/fs/smb/client/dfs_cache.c
@@ -1136,33 +1136,19 @@ static bool is_ses_good(struct cifs_ses *ses)
return ret;
}
-static char *get_ses_refpath(struct cifs_ses *ses)
-{
- struct TCP_Server_Info *server = ses->server;
- char *path = ERR_PTR(-ENOENT);
-
- if (server->leaf_fullpath) {
- path = kstrdup(server->leaf_fullpath + 1, GFP_KERNEL);
- if (!path)
- path = ERR_PTR(-ENOMEM);
- }
- return path;
-}
-
/* Refresh dfs referral of @ses */
static void refresh_ses_referral(struct cifs_ses *ses)
{
struct cache_entry *ce;
unsigned int xid;
- char *path;
+ const char *path;
int rc = 0;
xid = get_xid();
- path = get_ses_refpath(ses);
+ path = dfs_ses_refpath(ses);
if (IS_ERR(path)) {
rc = PTR_ERR(path);
- path = NULL;
goto out;
}
@@ -1181,7 +1167,6 @@ static void refresh_ses_referral(struct cifs_ses *ses)
out:
free_xid(xid);
- kfree(path);
}
static int __refresh_tcon_referral(struct cifs_tcon *tcon,
@@ -1231,19 +1216,18 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh)
struct dfs_info3_param *refs = NULL;
struct cache_entry *ce;
struct cifs_ses *ses;
- unsigned int xid;
bool needs_refresh;
- char *path;
+ const char *path;
+ unsigned int xid;
int numrefs = 0;
int rc = 0;
xid = get_xid();
ses = tcon->ses;
- path = get_ses_refpath(ses);
+ path = dfs_ses_refpath(ses);
if (IS_ERR(path)) {
rc = PTR_ERR(path);
- path = NULL;
goto out;
}
@@ -1271,7 +1255,6 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh)
out:
free_xid(xid);
- kfree(path);
free_dfs_info_array(refs, numrefs);
}
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 1822493dd084..d1e95632ac54 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -737,7 +737,8 @@ again:
}
static int
-cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
+cifs_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *direntry, unsigned int flags)
{
struct inode *inode;
int rc;
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 79de2f2f9c41..8582cf61242c 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -147,7 +147,7 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
- struct TCP_Server_Info *server = req->server;
+ struct TCP_Server_Info *server;
struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
size_t size;
int rc = 0;
@@ -156,6 +156,8 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
rdata->xid = get_xid();
rdata->have_xid = true;
}
+
+ server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
rdata->server = server;
if (cifs_sb->ctx->rsize == 0)
@@ -198,7 +200,7 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq)
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
- struct TCP_Server_Info *server = req->server;
+ struct TCP_Server_Info *server = rdata->server;
int rc = 0;
cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n",
@@ -266,7 +268,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
open_file = file->private_data;
rreq->netfs_priv = file->private_data;
req->cfile = cifsFileInfo_get(open_file);
- req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
req->pid = req->cfile->pid;
} else if (rreq->origin != NETFS_WRITEBACK) {
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 5381f05420bc..e9b286d9a7ba 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -133,6 +133,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("rootfs", Opt_rootfs),
fsparam_flag("compress", Opt_compress),
fsparam_flag("witness", Opt_witness),
+ fsparam_flag_no("nativesocket", Opt_nativesocket),
/* Mount options which take uid or gid */
fsparam_uid("backupuid", Opt_backupuid),
@@ -185,6 +186,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_string("cache", Opt_cache),
fsparam_string("reparse", Opt_reparse),
fsparam_string("upcall_target", Opt_upcalltarget),
+ fsparam_string("symlink", Opt_symlink),
+ fsparam_string("symlinkroot", Opt_symlinkroot),
/* Arguments that should be ignored */
fsparam_flag("guest", Opt_ignore),
@@ -332,6 +335,7 @@ cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_conte
static const match_table_t reparse_flavor_tokens = {
{ Opt_reparse_default, "default" },
+ { Opt_reparse_none, "none" },
{ Opt_reparse_nfs, "nfs" },
{ Opt_reparse_wsl, "wsl" },
{ Opt_reparse_err, NULL },
@@ -346,6 +350,9 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value,
case Opt_reparse_default:
ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
break;
+ case Opt_reparse_none:
+ ctx->reparse_type = CIFS_REPARSE_TYPE_NONE;
+ break;
case Opt_reparse_nfs:
ctx->reparse_type = CIFS_REPARSE_TYPE_NFS;
break;
@@ -359,6 +366,55 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value,
return 0;
}
+static const match_table_t symlink_flavor_tokens = {
+ { Opt_symlink_default, "default" },
+ { Opt_symlink_none, "none" },
+ { Opt_symlink_native, "native" },
+ { Opt_symlink_unix, "unix" },
+ { Opt_symlink_mfsymlinks, "mfsymlinks" },
+ { Opt_symlink_sfu, "sfu" },
+ { Opt_symlink_nfs, "nfs" },
+ { Opt_symlink_wsl, "wsl" },
+ { Opt_symlink_err, NULL },
+};
+
+static int parse_symlink_flavor(struct fs_context *fc, char *value,
+ struct smb3_fs_context *ctx)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, symlink_flavor_tokens, args)) {
+ case Opt_symlink_default:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT;
+ break;
+ case Opt_symlink_none:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_NONE;
+ break;
+ case Opt_symlink_native:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_NATIVE;
+ break;
+ case Opt_symlink_unix:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_UNIX;
+ break;
+ case Opt_symlink_mfsymlinks:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_MFSYMLINKS;
+ break;
+ case Opt_symlink_sfu:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_SFU;
+ break;
+ case Opt_symlink_nfs:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_NFS;
+ break;
+ case Opt_symlink_wsl:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_WSL;
+ break;
+ default:
+ cifs_errorf(fc, "bad symlink= option: %s\n", value);
+ return 1;
+ }
+ return 0;
+}
+
#define DUP_CTX_STR(field) \
do { \
if (ctx->field) { \
@@ -386,6 +442,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->iocharset = NULL;
new_ctx->leaf_fullpath = NULL;
new_ctx->dns_dom = NULL;
+ new_ctx->symlinkroot = NULL;
/*
* Make sure to stay in sync with smb3_cleanup_fs_context_contents()
*/
@@ -401,6 +458,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(iocharset);
DUP_CTX_STR(leaf_fullpath);
DUP_CTX_STR(dns_dom);
+ DUP_CTX_STR(symlinkroot);
return 0;
}
@@ -1727,6 +1785,23 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (parse_reparse_flavor(fc, param->string, ctx))
goto cifs_parse_mount_err;
break;
+ case Opt_nativesocket:
+ ctx->nonativesocket = result.negated;
+ break;
+ case Opt_symlink:
+ if (parse_symlink_flavor(fc, param->string, ctx))
+ goto cifs_parse_mount_err;
+ break;
+ case Opt_symlinkroot:
+ if (param->string[0] != '/') {
+ cifs_errorf(fc, "symlinkroot mount options must be absolute path\n");
+ goto cifs_parse_mount_err;
+ }
+ kfree(ctx->symlinkroot);
+ ctx->symlinkroot = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->symlinkroot)
+ goto cifs_parse_mount_err;
+ break;
}
/* case Opt_ignore: - is ignored as expected ... */
@@ -1735,6 +1810,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
goto cifs_parse_mount_err;
}
+ /*
+ * By default resolve all native absolute symlinks relative to "/mnt/".
+ * The drvfs driver running in WSL uses the same default when resolving SMB shares.
+ */
+ if (!ctx->symlinkroot)
+ ctx->symlinkroot = kstrdup("/mnt/", GFP_KERNEL);
+
return 0;
cifs_parse_mount_err:
@@ -1745,6 +1827,24 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
return -EINVAL;
}
+enum cifs_symlink_type get_cifs_symlink_type(struct cifs_sb_info *cifs_sb)
+{
+ if (cifs_sb->ctx->symlink_type == CIFS_SYMLINK_TYPE_DEFAULT) {
+ if (cifs_sb->ctx->mfsymlinks)
+ return CIFS_SYMLINK_TYPE_MFSYMLINKS;
+ else if (cifs_sb->ctx->sfu_emul)
+ return CIFS_SYMLINK_TYPE_SFU;
+ else if (cifs_sb->ctx->linux_ext && !cifs_sb->ctx->no_linux_ext)
+ return CIFS_SYMLINK_TYPE_UNIX;
+ else if (cifs_sb->ctx->reparse_type != CIFS_REPARSE_TYPE_NONE)
+ return CIFS_SYMLINK_TYPE_NATIVE;
+ else
+ return CIFS_SYMLINK_TYPE_NONE;
+ } else {
+ return cifs_sb->ctx->symlink_type;
+ }
+}
+
int smb3_init_fs_context(struct fs_context *fc)
{
struct smb3_fs_context *ctx;
@@ -1821,6 +1921,8 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->retrans = 1;
ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT;
+ ctx->nonativesocket = 0;
/*
* short int override_uid = -1;
@@ -1867,6 +1969,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->leaf_fullpath = NULL;
kfree(ctx->dns_dom);
ctx->dns_dom = NULL;
+ kfree(ctx->symlinkroot);
+ ctx->symlinkroot = NULL;
}
void
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 8813533345ee..881bfc08667e 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -43,11 +43,24 @@ enum {
enum cifs_reparse_parm {
Opt_reparse_default,
+ Opt_reparse_none,
Opt_reparse_nfs,
Opt_reparse_wsl,
Opt_reparse_err
};
+enum cifs_symlink_parm {
+ Opt_symlink_default,
+ Opt_symlink_none,
+ Opt_symlink_native,
+ Opt_symlink_unix,
+ Opt_symlink_mfsymlinks,
+ Opt_symlink_sfu,
+ Opt_symlink_nfs,
+ Opt_symlink_wsl,
+ Opt_symlink_err
+};
+
enum cifs_sec_param {
Opt_sec_krb5,
Opt_sec_krb5i,
@@ -166,6 +179,9 @@ enum cifs_param {
Opt_cache,
Opt_reparse,
Opt_upcalltarget,
+ Opt_nativesocket,
+ Opt_symlink,
+ Opt_symlinkroot,
/* Mount options to be ignored */
Opt_ignore,
@@ -294,12 +310,17 @@ struct smb3_fs_context {
struct cifs_ses *dfs_root_ses;
bool dfs_automount:1; /* set for dfs automount only */
enum cifs_reparse_type reparse_type;
+ enum cifs_symlink_type symlink_type;
+ bool nonativesocket:1;
bool dfs_conn:1; /* set for dfs mounts */
char *dns_dom;
+ char *symlinkroot; /* top level directory for native SMB symlinks in absolute format */
};
extern const struct fs_parameter_spec smb3_fs_parameters[];
+extern enum cifs_symlink_type get_cifs_symlink_type(struct cifs_sb_info *cifs_sb);
+
extern int smb3_init_fs_context(struct fs_context *fc);
extern void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx);
extern void smb3_cleanup_fs_context(struct smb3_fs_context *ctx);
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 93e9188b2632..616149c7f0a5 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -990,7 +990,7 @@ cifs_get_file_info(struct file *filp)
/* TODO: add support to query reparse tag */
data.adjust_tz = false;
if (data.symlink_target) {
- data.symlink = true;
+ data.reparse_point = true;
data.reparse.tag = IO_REPARSE_TAG_SYMLINK;
}
path = build_path_from_dentry(dentry, page);
@@ -1215,6 +1215,24 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
rc = server->ops->parse_reparse_point(cifs_sb,
full_path,
iov, data);
+ /*
+ * If the reparse point was not handled but it is a
+ * name surrogate which points to a directory, then treat
+ * it as a new mount point. A name surrogate reparse point
+ * represents another named entity in the system.
+ */
+ if (rc == -EOPNOTSUPP &&
+ IS_REPARSE_TAG_NAME_SURROGATE(data->reparse.tag) &&
+ (le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY)) {
+ rc = 0;
+ cifs_create_junction_fattr(fattr, sb);
+ goto out;
+ }
+ }
+
+ if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) {
+ bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY;
+ rc = smb2_fix_symlink_target_type(&data->symlink_target, directory, cifs_sb);
}
break;
}
@@ -1403,7 +1421,7 @@ int cifs_get_inode_info(struct inode **inode,
struct cifs_fattr fattr = {};
int rc;
- if (is_inode_cache_good(*inode)) {
+ if (!data && is_inode_cache_good(*inode)) {
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
return 0;
}
@@ -1502,7 +1520,7 @@ int smb311_posix_get_inode_info(struct inode **inode,
struct cifs_fattr fattr = {};
int rc;
- if (is_inode_cache_good(*inode)) {
+ if (!data && is_inode_cache_good(*inode)) {
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
return 0;
}
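
The "name surrogate" test above relies on a bit defined by MS-FSCC: bit 0x20000000 of a reparse tag marks tags whose reparse point stands in for another named entity (junctions, mount points and the like), which is why an unhandled surrogate on a directory can safely be surfaced as a junction. An illustrative check, mirroring what IS_REPARSE_TAG_NAME_SURROGATE is expected to test (constant name here is hypothetical):

#define REPARSE_TAG_NAME_SURROGATE_BIT 0x20000000      /* per MS-FSCC */

static inline bool tag_is_name_surrogate(u32 tag)
{
        return (tag & REPARSE_TAG_NAME_SURROGATE_BIT) != 0;
}
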
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index 47ddeb7fa111..6e6c09cc5ce7 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -18,6 +18,7 @@
#include "cifs_unicode.h"
#include "smb2proto.h"
#include "cifs_ioctl.h"
+#include "fs_context.h"
/*
* M-F Symlink Functions - Begin
@@ -604,22 +605,53 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
cifs_dbg(FYI, "symname is %s\n", symname);
/* BB what if DFS and this volume is on different share? BB */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
- rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
- } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
- rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon,
- full_path, S_IFLNK, 0, symname);
+ rc = -EOPNOTSUPP;
+ switch (get_cifs_symlink_type(cifs_sb)) {
+ case CIFS_SYMLINK_TYPE_DEFAULT:
+ /* should not happen, get_cifs_symlink_type() resolves the default */
+ break;
+
+ case CIFS_SYMLINK_TYPE_NONE:
+ break;
+
+ case CIFS_SYMLINK_TYPE_UNIX:
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- } else if (pTcon->unix_ext) {
- rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
+ if (pTcon->unix_ext) {
+ rc = CIFSUnixCreateSymLink(xid, pTcon, full_path,
+ symname,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ }
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
- } else if (server->ops->create_reparse_symlink) {
- rc = server->ops->create_reparse_symlink(xid, inode, direntry,
- pTcon, full_path,
- symname);
- goto symlink_exit;
+ break;
+
+ case CIFS_SYMLINK_TYPE_MFSYMLINKS:
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+ rc = create_mf_symlink(xid, pTcon, cifs_sb,
+ full_path, symname);
+ }
+ break;
+
+ case CIFS_SYMLINK_TYPE_SFU:
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+ rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon,
+ full_path, S_IFLNK,
+ 0, symname);
+ }
+ break;
+
+ case CIFS_SYMLINK_TYPE_NATIVE:
+ case CIFS_SYMLINK_TYPE_NFS:
+ case CIFS_SYMLINK_TYPE_WSL:
+ if (server->ops->create_reparse_symlink) {
+ rc = server->ops->create_reparse_symlink(xid, inode,
+ direntry,
+ pTcon,
+ full_path,
+ symname);
+ goto symlink_exit;
+ }
+ break;
}
if (rc == 0) {
diff --git a/fs/smb/client/netmisc.c b/fs/smb/client/netmisc.c
index 17b3e21ea868..9ec20601cee2 100644
--- a/fs/smb/client/netmisc.c
+++ b/fs/smb/client/netmisc.c
@@ -313,7 +313,6 @@ static const struct {
ERRDOS, 2215, NT_STATUS_NO_LOGON_SERVERS}, {
ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, {
ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, {
- ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, {
ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, {
ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS},
/* { This NT error code was 'sqashed'
@@ -871,6 +870,15 @@ map_smb_to_linux_error(char *buf, bool logErr)
}
/* else ERRHRD class errors or junk - return EIO */
+ /* special cases for NT status codes which cannot be translated to DOS codes */
+ if (smb->Flags2 & SMBFLG2_ERR_STATUS) {
+ __u32 err = le32_to_cpu(smb->Status.CifsError);
+ if (err == (NT_STATUS_NOT_A_REPARSE_POINT))
+ rc = -ENODATA;
+ else if (err == (NT_STATUS_PRIVILEGE_NOT_HELD))
+ rc = -EPERM;
+ }
+
cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n",
le32_to_cpu(smb->Status.CifsError), rc);
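A rough standalone sketch of the special-case mapping added above, kept separate from the generic DOS-class translation; the helper name map_special_nt_status() and the userspace errno constants are assumptions made for illustration only:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define NT_STATUS_PRIVILEGE_NOT_HELD    0xC0000061
#define NT_STATUS_NOT_A_REPARSE_POINT   0xC0000275

/* Applied only when the SMB1 header carries a raw NT status (SMBFLG2_ERR_STATUS). */
static int map_special_nt_status(uint32_t status, int rc)
{
        if (status == NT_STATUS_NOT_A_REPARSE_POINT)
                return -ENODATA;
        if (status == NT_STATUS_PRIVILEGE_NOT_HELD)
                return -EPERM;
        return rc;              /* fall back to the generic mapping */
}

int main(void)
{
        printf("%d\n", map_special_nt_status(NT_STATUS_NOT_A_REPARSE_POINT, -EIO));
        printf("%d\n", map_special_nt_status(NT_STATUS_PRIVILEGE_NOT_HELD, -EIO));
        return 0;
}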
diff --git a/fs/smb/client/nterr.c b/fs/smb/client/nterr.c
index d396a8e98a81..8f0bc441295e 100644
--- a/fs/smb/client/nterr.c
+++ b/fs/smb/client/nterr.c
@@ -674,6 +674,7 @@ const struct nt_err_code_struct nt_errs[] = {
{"NT_STATUS_QUOTA_LIST_INCONSISTENT",
NT_STATUS_QUOTA_LIST_INCONSISTENT},
{"NT_STATUS_FILE_IS_OFFLINE", NT_STATUS_FILE_IS_OFFLINE},
+ {"NT_STATUS_NOT_A_REPARSE_POINT", NT_STATUS_NOT_A_REPARSE_POINT},
{"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES},
{"NT_STATUS_MORE_ENTRIES", NT_STATUS_MORE_ENTRIES},
{"NT_STATUS_SOME_UNMAPPED", NT_STATUS_SOME_UNMAPPED},
diff --git a/fs/smb/client/nterr.h b/fs/smb/client/nterr.h
index edd4741cab0a..180602c22355 100644
--- a/fs/smb/client/nterr.h
+++ b/fs/smb/client/nterr.h
@@ -546,6 +546,7 @@ extern const struct nt_err_code_struct nt_errs[];
#define NT_STATUS_TOO_MANY_LINKS 0xC0000000 | 0x0265
#define NT_STATUS_QUOTA_LIST_INCONSISTENT 0xC0000000 | 0x0266
#define NT_STATUS_FILE_IS_OFFLINE 0xC0000000 | 0x0267
+#define NT_STATUS_NOT_A_REPARSE_POINT 0xC0000000 | 0x0275
#define NT_STATUS_NO_SUCH_JOB 0xC0000000 | 0xEDE /* scheduler */
#endif /* _NTERR_H */
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index d88b41133e00..2b9e9885dc42 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -14,6 +14,20 @@
#include "fs_context.h"
#include "reparse.h"
+static int mknod_nfs(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname);
+
+static int mknod_wsl(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname);
+
+static int create_native_symlink(const unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, const char *symname);
+
static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
const unsigned int xid,
const char *full_path,
@@ -24,37 +38,148 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, const char *symname)
{
+ switch (get_cifs_symlink_type(CIFS_SB(inode->i_sb))) {
+ case CIFS_SYMLINK_TYPE_NATIVE:
+ return create_native_symlink(xid, inode, dentry, tcon, full_path, symname);
+ case CIFS_SYMLINK_TYPE_NFS:
+ return mknod_nfs(xid, inode, dentry, tcon, full_path, S_IFLNK, 0, symname);
+ case CIFS_SYMLINK_TYPE_WSL:
+ return mknod_wsl(xid, inode, dentry, tcon, full_path, S_IFLNK, 0, symname);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int create_native_symlink(const unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, const char *symname)
+{
struct reparse_symlink_data_buffer *buf = NULL;
- struct cifs_open_info_data data;
+ struct cifs_open_info_data data = {};
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct inode *new;
struct kvec iov;
- __le16 *path;
+ __le16 *path = NULL;
bool directory;
- char *sym, sep = CIFS_DIR_SEP(cifs_sb);
- u16 len, plen;
+ char *symlink_target = NULL;
+ char *sym = NULL;
+ char sep = CIFS_DIR_SEP(cifs_sb);
+ u16 len, plen, poff, slen;
int rc = 0;
if (strlen(symname) > REPARSE_SYM_PATH_MAX)
return -ENAMETOOLONG;
- sym = kstrdup(symname, GFP_KERNEL);
- if (!sym)
- return -ENOMEM;
+ symlink_target = kstrdup(symname, GFP_KERNEL);
+ if (!symlink_target) {
+ rc = -ENOMEM;
+ goto out;
+ }
data = (struct cifs_open_info_data) {
.reparse_point = true,
.reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
- .symlink_target = sym,
+ .symlink_target = symlink_target,
};
- convert_delimiter(sym, sep);
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') {
+ /*
+ * This is a request to create an absolute symlink on a server
+ * which does not support POSIX paths and expects symlinks as
+ * NT-style paths. So convert the absolute Linux symlink target
+ * path to an absolute NT-style path. The root of the NT-style
+ * path for symlinks is specified by the "symlinkroot" mount
+ * option. This ensures compatibility of this symlink stored in
+ * absolute form on the SMB server.
+ */
+ if (!strstarts(symname, cifs_sb->ctx->symlinkroot)) {
+ /*
+ * If the absolute Linux symlink target path is not
+ * inside the "symlinkroot" location then there is no way
+ * to convert such a Linux symlink to an NT-style path.
+ */
+ cifs_dbg(VFS,
+ "absolute symlink '%s' cannot be converted to NT format "
+ "because it is outside of symlinkroot='%s'\n",
+ symname, cifs_sb->ctx->symlinkroot);
+ rc = -EINVAL;
+ goto out;
+ }
+ len = strlen(cifs_sb->ctx->symlinkroot);
+ if (cifs_sb->ctx->symlinkroot[len-1] != '/')
+ len++;
+ if (symname[len] >= 'a' && symname[len] <= 'z' &&
+ (symname[len+1] == '/' || symname[len+1] == '\0')) {
+ /*
+ * The symlink points to the Linux target /symlinkroot/x/path/...
+ * where 'x' is the lowercase local Windows drive letter.
+ * The NT-style path for 'x' has the common form \??\X:\path\...
+ * with an uppercase local Windows drive letter.
+ */
+ int common_path_len = strlen(symname+len+1)+1;
+ sym = kzalloc(6+common_path_len, GFP_KERNEL);
+ if (!sym) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ memcpy(sym, "\\??\\", 4);
+ sym[4] = symname[len] - ('a'-'A');
+ sym[5] = ':';
+ memcpy(sym+6, symname+len+1, common_path_len);
+ } else {
+ /* Unhandled absolute symlink. Report an error. */
+ cifs_dbg(
+ VFS,
+ "absolute symlink '%s' cannot be converted to NT format "
+ "because it points to unknown target\n",
+ symname);
+ rc = -EINVAL;
+ goto out;
+ }
+ } else {
+ /*
+ * This is a request either to create an absolute symlink on a
+ * server which expects POSIX paths, or to create a relative
+ * symlink from the current directory. These paths have the same
+ * format as relative SMB symlinks, so no conversion is needed.
+ * Just take symname as-is.
+ */
+ sym = kstrdup(symname, GFP_KERNEL);
+ if (!sym) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (sep == '\\')
+ convert_delimiter(sym, sep);
+
+ /*
+ * For absolute NT symlinks the leading backslash must also be
+ * passed, and the NT object prefix "\\??\\" and the colon in the
+ * drive letter must not be mangled. But cifs_convert_path_to_utf16()
+ * removes the leading backslash and replaces '?' and ':'. So
+ * temporarily mask these characters in the NT object prefix with
+ * '_' and then change them back.
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/')
+ sym[0] = sym[1] = sym[2] = sym[5] = '_';
+
path = cifs_convert_path_to_utf16(sym, cifs_sb);
if (!path) {
rc = -ENOMEM;
goto out;
}
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') {
+ sym[0] = '\\';
+ sym[1] = sym[2] = '?';
+ sym[5] = ':';
+ path[0] = cpu_to_le16('\\');
+ path[1] = path[2] = cpu_to_le16('?');
+ path[5] = cpu_to_le16(':');
+ }
+
/*
* SMB distinguish between symlink to directory and symlink to file.
* They cannot be exchanged (symlink of file type which points to
@@ -67,8 +192,18 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
if (rc < 0)
goto out;
- plen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX);
- len = sizeof(*buf) + plen * 2;
+ slen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX);
+ poff = 0;
+ plen = slen;
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') {
+ /*
+ * For absolute NT symlinks skip the leading "\\??\\" in PrintName, as
+ * PrintName is the user-visible location in DOS/Win32 format (not in NT format).
+ */
+ poff = 4;
+ plen -= 2 * poff;
+ }
+ len = sizeof(*buf) + plen + slen;
buf = kzalloc(len, GFP_KERNEL);
if (!buf) {
rc = -ENOMEM;
@@ -77,17 +212,17 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
+
buf->SubstituteNameOffset = cpu_to_le16(plen);
- buf->SubstituteNameLength = cpu_to_le16(plen);
- memcpy(&buf->PathBuffer[plen], path, plen);
+ buf->SubstituteNameLength = cpu_to_le16(slen);
+ memcpy(&buf->PathBuffer[plen], path, slen);
+
buf->PrintNameOffset = 0;
buf->PrintNameLength = cpu_to_le16(plen);
- memcpy(buf->PathBuffer, path, plen);
+ memcpy(buf->PathBuffer, path+poff, plen);
+
buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
- if (*sym != sep)
- buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
- convert_delimiter(sym, '/');
iov.iov_base = buf;
iov.iov_len = len;
new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
@@ -98,6 +233,7 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
else
rc = PTR_ERR(new);
out:
+ kfree(sym);
kfree(path);
cifs_free_open_info(&data);
kfree(buf);
@@ -242,8 +378,39 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
return 0;
}
-static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
+static int create_native_socket(const unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path)
+{
+ struct reparse_data_buffer buf = {
+ .ReparseTag = cpu_to_le32(IO_REPARSE_TAG_AF_UNIX),
+ .ReparseDataLength = cpu_to_le16(0),
+ };
+ struct cifs_open_info_data data = {
+ .reparse_point = true,
+ .reparse = { .tag = IO_REPARSE_TAG_AF_UNIX, .buf = &buf, },
+ };
+ struct kvec iov = {
+ .iov_base = &buf,
+ .iov_len = sizeof(buf),
+ };
+ struct inode *new;
+ int rc = 0;
+
+ new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ tcon, full_path, false, &iov, NULL);
+ if (!IS_ERR(new))
+ d_instantiate(dentry, new);
+ else
+ rc = PTR_ERR(new);
+ cifs_free_open_info(&data);
+ return rc;
+}
+
+static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf,
mode_t mode, dev_t dev,
+ __le16 *symname_utf16,
+ int symname_utf16_len,
struct kvec *iov)
{
u64 type;
@@ -254,7 +421,13 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
switch ((type = reparse_mode_nfs_type(mode))) {
case NFS_SPECFILE_BLK:
case NFS_SPECFILE_CHR:
- dlen = sizeof(__le64);
+ dlen = 2 * sizeof(__le32);
+ ((__le32 *)buf->DataBuffer)[0] = cpu_to_le32(MAJOR(dev));
+ ((__le32 *)buf->DataBuffer)[1] = cpu_to_le32(MINOR(dev));
+ break;
+ case NFS_SPECFILE_LNK:
+ dlen = symname_utf16_len;
+ memcpy(buf->DataBuffer, symname_utf16, symname_utf16_len);
break;
case NFS_SPECFILE_FIFO:
case NFS_SPECFILE_SOCK:
@@ -269,8 +442,6 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
buf->InodeType = cpu_to_le64(type);
buf->ReparseDataLength = cpu_to_le16(len + dlen -
sizeof(struct reparse_data_buffer));
- *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MINOR(dev) << 32) |
- MAJOR(dev));
iov->iov_base = buf;
iov->iov_len = len + dlen;
return 0;
@@ -278,23 +449,45 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
static int mknod_nfs(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname)
{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_open_info_data data;
- struct reparse_posix_data *p;
+ struct reparse_nfs_data_buffer *p = NULL;
+ __le16 *symname_utf16 = NULL;
+ int symname_utf16_len = 0;
struct inode *new;
struct kvec iov;
__u8 buf[sizeof(*p) + sizeof(__le64)];
int rc;
- p = (struct reparse_posix_data *)buf;
- rc = nfs_set_reparse_buf(p, mode, dev, &iov);
+ if (S_ISLNK(mode)) {
+ symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname),
+ &symname_utf16_len,
+ cifs_sb->local_nls,
+ NO_MAP_UNI_RSVD);
+ if (!symname_utf16) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ symname_utf16_len -= 2; /* symlink is without trailing wide-nul */
+ p = kzalloc(sizeof(*p) + symname_utf16_len, GFP_KERNEL);
+ if (!p) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ } else {
+ p = (struct reparse_nfs_data_buffer *)buf;
+ }
+ rc = nfs_set_reparse_buf(p, mode, dev, symname_utf16, symname_utf16_len, &iov);
if (rc)
- return rc;
+ goto out;
data = (struct cifs_open_info_data) {
.reparse_point = true,
- .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
+ .reparse = { .tag = IO_REPARSE_TAG_NFS, .buf = (struct reparse_data_buffer *)p, },
+ .symlink_target = kstrdup(symname, GFP_KERNEL),
};
new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
@@ -304,12 +497,25 @@ static int mknod_nfs(unsigned int xid, struct inode *inode,
else
rc = PTR_ERR(new);
cifs_free_open_info(&data);
+out:
+ if (S_ISLNK(mode)) {
+ kfree(symname_utf16);
+ kfree(p);
+ }
return rc;
}
-static int wsl_set_reparse_buf(struct reparse_data_buffer *buf,
- mode_t mode, struct kvec *iov)
+static int wsl_set_reparse_buf(struct reparse_data_buffer **buf,
+ mode_t mode, const char *symname,
+ struct cifs_sb_info *cifs_sb,
+ struct kvec *iov)
{
+ struct reparse_wsl_symlink_data_buffer *symlink_buf;
+ __le16 *symname_utf16;
+ int symname_utf16_len;
+ int symname_utf8_maxlen;
+ int symname_utf8_len;
+ size_t buf_len;
u32 tag;
switch ((tag = reparse_mode_wsl_tag(mode))) {
@@ -317,16 +523,45 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer *buf,
case IO_REPARSE_TAG_LX_CHR:
case IO_REPARSE_TAG_LX_FIFO:
case IO_REPARSE_TAG_AF_UNIX:
+ buf_len = sizeof(struct reparse_data_buffer);
+ *buf = kzalloc(buf_len, GFP_KERNEL);
+ if (!*buf)
+ return -ENOMEM;
+ break;
+ case IO_REPARSE_TAG_LX_SYMLINK:
+ symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname),
+ &symname_utf16_len,
+ cifs_sb->local_nls,
+ NO_MAP_UNI_RSVD);
+ if (!symname_utf16)
+ return -ENOMEM;
+ symname_utf8_maxlen = symname_utf16_len/2*3;
+ symlink_buf = kzalloc(sizeof(struct reparse_wsl_symlink_data_buffer) +
+ symname_utf8_maxlen, GFP_KERNEL);
+ if (!symlink_buf) {
+ kfree(symname_utf16);
+ return -ENOMEM;
+ }
+ /* Flag 0x02000000 is unknown, but all WSL symlinks have this value */
+ symlink_buf->Flags = cpu_to_le32(0x02000000);
+ /* PathBuffer is in UTF-8 but without trailing null-term byte */
+ symname_utf8_len = utf16s_to_utf8s((wchar_t *)symname_utf16, symname_utf16_len/2,
+ UTF16_LITTLE_ENDIAN,
+ symlink_buf->PathBuffer,
+ symname_utf8_maxlen);
+ *buf = (struct reparse_data_buffer *)symlink_buf;
+ buf_len = sizeof(struct reparse_wsl_symlink_data_buffer) + symname_utf8_len;
+ kfree(symname_utf16);
break;
default:
return -EOPNOTSUPP;
}
- buf->ReparseTag = cpu_to_le32(tag);
- buf->Reserved = 0;
- buf->ReparseDataLength = 0;
- iov->iov_base = buf;
- iov->iov_len = sizeof(*buf);
+ (*buf)->ReparseTag = cpu_to_le32(tag);
+ (*buf)->Reserved = 0;
+ (*buf)->ReparseDataLength = cpu_to_le16(buf_len - sizeof(struct reparse_data_buffer));
+ iov->iov_base = *buf;
+ iov->iov_len = buf_len;
return 0;
}
@@ -415,27 +650,32 @@ static int wsl_set_xattrs(struct inode *inode, umode_t _mode,
static int mknod_wsl(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname)
{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_open_info_data data;
- struct reparse_data_buffer buf;
+ struct reparse_data_buffer *buf;
struct smb2_create_ea_ctx *cc;
struct inode *new;
unsigned int len;
struct kvec reparse_iov, xattr_iov;
int rc;
- rc = wsl_set_reparse_buf(&buf, mode, &reparse_iov);
+ rc = wsl_set_reparse_buf(&buf, mode, symname, cifs_sb, &reparse_iov);
if (rc)
return rc;
rc = wsl_set_xattrs(inode, mode, dev, &xattr_iov);
- if (rc)
+ if (rc) {
+ kfree(buf);
return rc;
+ }
data = (struct cifs_open_info_data) {
.reparse_point = true,
- .reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, },
+ .reparse = { .tag = le32_to_cpu(buf->ReparseTag), .buf = buf, },
+ .symlink_target = kstrdup(symname, GFP_KERNEL),
};
cc = xattr_iov.iov_base;
@@ -452,6 +692,7 @@ static int mknod_wsl(unsigned int xid, struct inode *inode,
rc = PTR_ERR(new);
cifs_free_open_info(&data);
kfree(xattr_iov.iov_base);
+ kfree(buf);
return rc;
}
@@ -460,21 +701,22 @@ int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
const char *full_path, umode_t mode, dev_t dev)
{
struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx;
- int rc = -EOPNOTSUPP;
+
+ if (S_ISSOCK(mode) && !ctx->nonativesocket && ctx->reparse_type != CIFS_REPARSE_TYPE_NONE)
+ return create_native_socket(xid, inode, dentry, tcon, full_path);
switch (ctx->reparse_type) {
case CIFS_REPARSE_TYPE_NFS:
- rc = mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev);
- break;
+ return mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev, NULL);
case CIFS_REPARSE_TYPE_WSL:
- rc = mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev);
- break;
+ return mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev, NULL);
+ default:
+ return -EOPNOTSUPP;
}
- return rc;
}
/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
-static int parse_reparse_posix(struct reparse_posix_data *buf,
+static int parse_reparse_nfs(struct reparse_nfs_data_buffer *buf,
struct cifs_sb_info *cifs_sb,
struct cifs_open_info_data *data)
{
@@ -536,43 +778,160 @@ static int parse_reparse_posix(struct reparse_posix_data *buf,
}
int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len,
- bool unicode, bool relative,
+ bool relative,
const char *full_path,
struct cifs_sb_info *cifs_sb)
{
char sep = CIFS_DIR_SEP(cifs_sb);
char *linux_target = NULL;
char *smb_target = NULL;
+ int symlinkroot_len;
+ int abs_path_len;
+ char *abs_path;
int levels;
int rc;
int i;
- /* Check that length it valid for unicode/non-unicode mode */
- if (!len || (unicode && (len % 2))) {
+ /* Check that length it valid */
+ if (!len || (len % 2)) {
cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
rc = -EIO;
goto out;
}
/*
- * Check that buffer does not contain UTF-16 null codepoint in unicode
- * mode or null byte in non-unicode mode because Linux cannot process
- * symlink with null byte.
+ * Check that the buffer does not contain a UTF-16 null codepoint
+ * because Linux cannot process a symlink with a null byte.
*/
- if ((unicode && UniStrnlen((wchar_t *)buf, len/2) != len/2) ||
- (!unicode && strnlen(buf, len) != len)) {
+ if (UniStrnlen((wchar_t *)buf, len/2) != len/2) {
cifs_dbg(VFS, "srv returned null byte in native symlink target location\n");
rc = -EIO;
goto out;
}
- smb_target = cifs_strndup_from_utf16(buf, len, unicode, cifs_sb->local_nls);
+ smb_target = cifs_strndup_from_utf16(buf, len, true, cifs_sb->local_nls);
if (!smb_target) {
rc = -ENOMEM;
goto out;
}
- if (smb_target[0] == sep && relative) {
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && !relative) {
+ /*
+ * This is an absolute symlink from a server which does not
+ * support POSIX paths, so the symlink is an NT-style path.
+ * Convert it to an absolute Linux symlink target path. The root
+ * of the NT-style path for symlinks is specified by the
+ * "symlinkroot" mount option.
+ *
+ * Root of the DOS and Win32 paths is at NT path \??\
+ * It means that DOS/Win32 path C:\folder\file.txt is
+ * NT path \??\C:\folder\file.txt
+ *
+ * NT systems have some well-known object symlinks in their NT
+ * hierarchy, which need to be taken into account when resolving
+ * other symlinks. The most commonly used symlink paths are:
+ * \?? -> \GLOBAL??
+ * \DosDevices -> \??
+ * \GLOBAL??\GLOBALROOT -> \
+ * \GLOBAL??\Global -> \GLOBAL??
+ * \GLOBAL??\NUL -> \Device\Null
+ * \GLOBAL??\UNC -> \Device\Mup
+ * \GLOBAL??\PhysicalDrive0 -> \Device\Harddisk0\DR0 (for each harddisk)
+ * \GLOBAL??\A: -> \Device\Floppy0 (if A: is the first floppy)
+ * \GLOBAL??\C: -> \Device\HarddiskVolume1 (if C: is the first harddisk)
+ * \GLOBAL??\D: -> \Device\CdRom0 (if D: is first cdrom)
+ * \SystemRoot -> \Device\Harddisk0\Partition1\WINDOWS (or where is NT system installed)
+ * \Volume{...} -> \Device\HarddiskVolume1 (where ... is system generated guid)
+ *
+ * In the most common cases, absolute NT symlinks point to a path
+ * on a DOS/Win32 drive letter, a system-specific volume, or a UNC
+ * share. Here are a few examples of commonly used absolute NT
+ * symlinks created by the mklink.exe tool:
+ * \??\C:\folder\file.txt
+ * \??\\C:\folder\file.txt
+ * \??\UNC\server\share\file.txt
+ * \??\\UNC\server\share\file.txt
+ * \??\Volume{b75e2c83-0000-0000-0000-602f00000000}\folder\file.txt
+ *
+ * It means that the most common path prefix \??\ is itself an NT
+ * path symlink (to \GLOBAL??). It is less common for the second
+ * path separator to be a double backslash, but it is valid.
+ *
+ * The volume GUID is randomly generated by the target system, so
+ * only the target system knows the mapping between the GUID and
+ * the harddisk number. Over SMB it is not possible to resolve this
+ * mapping, therefore symlinks pointing to the target location of
+ * volume GUIDs are unusable over SMB.
+ *
+ * For now parse only symlink paths usable by DOS and Win32.
+ * Those are paths with the \??\ prefix or paths which point to \??\
+ * via another NT symlink (\DosDevices\, \GLOBAL??\, ...).
+ */
+ abs_path = smb_target;
+globalroot:
+ if (strstarts(abs_path, "\\??\\"))
+ abs_path += sizeof("\\??\\")-1;
+ else if (strstarts(abs_path, "\\DosDevices\\"))
+ abs_path += sizeof("\\DosDevices\\")-1;
+ else if (strstarts(abs_path, "\\GLOBAL??\\"))
+ abs_path += sizeof("\\GLOBAL??\\")-1;
+ else {
+ /* Unhandled absolute symlink, points outside of DOS/Win32 */
+ cifs_dbg(VFS,
+ "absolute symlink '%s' cannot be converted from NT format "
+ "because points to unknown target\n",
+ smb_target);
+ rc = -EIO;
+ goto out;
+ }
+
+ /* Sometimes the path separator after \?? is a double backslash */
+ if (abs_path[0] == '\\')
+ abs_path++;
+
+ while (strstarts(abs_path, "Global\\"))
+ abs_path += sizeof("Global\\")-1;
+
+ if (strstarts(abs_path, "GLOBALROOT\\")) {
+ /* The GLOBALROOT label requires a path with a leading '\\', so do not trim '\\' */
+ abs_path += sizeof("GLOBALROOT")-1;
+ goto globalroot;
+ }
+
+ /* For now parse only paths to drive letters */
+ if (((abs_path[0] >= 'A' && abs_path[0] <= 'Z') ||
+ (abs_path[0] >= 'a' && abs_path[0] <= 'z')) &&
+ abs_path[1] == ':' &&
+ (abs_path[2] == '\\' || abs_path[2] == '\0')) {
+ /* Convert drive letter to lowercase and drop colon */
+ char drive_letter = abs_path[0];
+ if (drive_letter >= 'A' && drive_letter <= 'Z')
+ drive_letter += 'a'-'A';
+ abs_path++;
+ abs_path[0] = drive_letter;
+ } else {
+ /* Unhandled absolute symlink. Report an error. */
+ cifs_dbg(VFS,
+ "absolute symlink '%s' cannot be converted from NT format "
+ "because points to unknown target\n",
+ smb_target);
+ rc = -EIO;
+ goto out;
+ }
+
+ abs_path_len = strlen(abs_path)+1;
+ symlinkroot_len = strlen(cifs_sb->ctx->symlinkroot);
+ if (cifs_sb->ctx->symlinkroot[symlinkroot_len-1] == '/')
+ symlinkroot_len--;
+ linux_target = kmalloc(symlinkroot_len + 1 + abs_path_len, GFP_KERNEL);
+ if (!linux_target) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ memcpy(linux_target, cifs_sb->ctx->symlinkroot, symlinkroot_len);
+ linux_target[symlinkroot_len] = '/';
+ memcpy(linux_target + symlinkroot_len + 1, abs_path, abs_path_len);
+ } else if (smb_target[0] == sep && relative) {
/*
* This is a relative SMB symlink from the top of the share,
* which is the top level directory of the Linux mount point.
@@ -601,6 +960,12 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len,
}
memcpy(linux_target + levels*3, smb_target+1, smb_target_len); /* +1 to skip leading sep */
} else {
+ /*
+ * This is either an absolute symlink in POSIX-style format
+ * or a relative SMB symlink from the current directory.
+ * These paths have the same format as Linux symlinks, so no
+ * conversion is needed.
+ */
linux_target = smb_target;
smb_target = NULL;
}
@@ -620,8 +985,8 @@ out:
return rc;
}
-static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
- u32 plen, bool unicode,
+static int parse_reparse_native_symlink(struct reparse_symlink_data_buffer *sym,
+ u32 plen,
struct cifs_sb_info *cifs_sb,
const char *full_path,
struct cifs_open_info_data *data)
@@ -641,7 +1006,6 @@ static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
return smb2_parse_native_symlink(&data->symlink_target,
sym->PathBuffer + offs,
len,
- unicode,
le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE,
full_path,
cifs_sb);
@@ -696,7 +1060,7 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf
int parse_reparse_point(struct reparse_data_buffer *buf,
u32 plen, struct cifs_sb_info *cifs_sb,
const char *full_path,
- bool unicode, struct cifs_open_info_data *data)
+ struct cifs_open_info_data *data)
{
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
@@ -705,12 +1069,12 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
/* See MS-FSCC 2.1.2 */
switch (le32_to_cpu(buf->ReparseTag)) {
case IO_REPARSE_TAG_NFS:
- return parse_reparse_posix((struct reparse_posix_data *)buf,
+ return parse_reparse_nfs((struct reparse_nfs_data_buffer *)buf,
cifs_sb, data);
case IO_REPARSE_TAG_SYMLINK:
- return parse_reparse_symlink(
+ return parse_reparse_native_symlink(
(struct reparse_symlink_data_buffer *)buf,
- plen, unicode, cifs_sb, full_path, data);
+ plen, cifs_sb, full_path, data);
case IO_REPARSE_TAG_LX_SYMLINK:
return parse_reparse_wsl_symlink(
(struct reparse_wsl_symlink_data_buffer *)buf,
@@ -724,13 +1088,12 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
le32_to_cpu(buf->ReparseTag));
return -EIO;
}
- break;
+ return 0;
default:
cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n",
le32_to_cpu(buf->ReparseTag));
- break;
+ return -EOPNOTSUPP;
}
- return 0;
}
int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
@@ -744,14 +1107,15 @@ int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
buf = (struct reparse_data_buffer *)((u8 *)io +
le32_to_cpu(io->OutputOffset));
- return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data);
+ return parse_reparse_point(buf, plen, cifs_sb, full_path, data);
}
-static void wsl_to_fattr(struct cifs_open_info_data *data,
+static bool wsl_to_fattr(struct cifs_open_info_data *data,
struct cifs_sb_info *cifs_sb,
u32 tag, struct cifs_fattr *fattr)
{
struct smb2_file_full_ea_info *ea;
+ bool have_xattr_dev = false;
u32 next = 0;
switch (tag) {
@@ -794,21 +1158,31 @@ static void wsl_to_fattr(struct cifs_open_info_data *data,
fattr->cf_uid = wsl_make_kuid(cifs_sb, v);
else if (!strncmp(name, SMB2_WSL_XATTR_GID, nlen))
fattr->cf_gid = wsl_make_kgid(cifs_sb, v);
- else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen))
+ else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen)) {
+ /* File type in reparse point tag and in xattr mode must match. */
+ if (S_DT(fattr->cf_mode) != S_DT(le32_to_cpu(*(__le32 *)v)))
+ return false;
fattr->cf_mode = (umode_t)le32_to_cpu(*(__le32 *)v);
- else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen))
+ } else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen)) {
fattr->cf_rdev = reparse_mkdev(v);
+ have_xattr_dev = true;
+ }
} while (next);
out:
+
+ /* Major and minor numbers for char and block devices are mandatory. */
+ if (!have_xattr_dev && (tag == IO_REPARSE_TAG_LX_CHR || tag == IO_REPARSE_TAG_LX_BLK))
+ return false;
+
fattr->cf_dtype = S_DT(fattr->cf_mode);
+ return true;
}
static bool posix_reparse_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
struct cifs_open_info_data *data)
{
- struct reparse_posix_data *buf = data->reparse.posix;
-
+ struct reparse_nfs_data_buffer *buf = (struct reparse_nfs_data_buffer *)data->reparse.buf;
if (buf == NULL)
return true;
@@ -874,7 +1248,9 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
case IO_REPARSE_TAG_AF_UNIX:
case IO_REPARSE_TAG_LX_CHR:
case IO_REPARSE_TAG_LX_BLK:
- wsl_to_fattr(data, cifs_sb, tag, fattr);
+ ok = wsl_to_fattr(data, cifs_sb, tag, fattr);
+ if (!ok)
+ return false;
break;
case IO_REPARSE_TAG_NFS:
ok = posix_reparse_to_fattr(cifs_sb, fattr, data);
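The NT-to-Linux mapping described in the smb2_parse_native_symlink() comments above can be summarised with a rough standalone sketch; the symlinkroot value "/mnt" and the helper name nt_to_linux() are assumptions for illustration only, and error handling is reduced to returning NULL:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int starts(const char *s, const char *p)
{
        return strncmp(s, p, strlen(p)) == 0;
}

/* Returns a malloc'd Linux path, or NULL if the NT path is not handled. */
static char *nt_to_linux(const char *nt, const char *symlinkroot)
{
        const char *p = nt;
        size_t rootlen = strlen(symlinkroot);
        char *out, *q;

globalroot:
        if (starts(p, "\\??\\"))
                p += strlen("\\??\\");
        else if (starts(p, "\\DosDevices\\"))
                p += strlen("\\DosDevices\\");
        else if (starts(p, "\\GLOBAL??\\"))
                p += strlen("\\GLOBAL??\\");
        else
                return NULL;                    /* points outside of DOS/Win32 */

        if (*p == '\\')                         /* tolerate "\??\\C:\..." */
                p++;
        while (starts(p, "Global\\"))
                p += strlen("Global\\");
        if (starts(p, "GLOBALROOT\\")) {
                p += strlen("GLOBALROOT");      /* keep the leading '\' */
                goto globalroot;
        }

        /* only drive-letter paths are handled, as in the patch */
        if (!isalpha((unsigned char)p[0]) || p[1] != ':' ||
            (p[2] != '\\' && p[2] != '\0'))
                return NULL;

        out = malloc(rootlen + strlen(p + 2) + 3);
        if (!out)
                return NULL;
        sprintf(out, "%s/%c%s", symlinkroot,
                tolower((unsigned char)p[0]), p + 2);
        for (q = out + rootlen; *q; q++)        /* '\' -> '/' */
                if (*q == '\\')
                        *q = '/';
        return out;
}

int main(void)
{
        char *s = nt_to_linux("\\??\\C:\\folder\\file.txt", "/mnt");

        printf("%s\n", s ? s : "(unhandled)");  /* /mnt/c/folder/file.txt */
        free(s);
        return 0;
}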
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
index ff05b0e75c92..c0be5ab45a78 100644
--- a/fs/smb/client/reparse.h
+++ b/fs/smb/client/reparse.h
@@ -50,6 +50,7 @@ static inline kgid_t wsl_make_kgid(struct cifs_sb_info *cifs_sb,
static inline u64 reparse_mode_nfs_type(mode_t mode)
{
switch (mode & S_IFMT) {
+ case S_IFLNK: return NFS_SPECFILE_LNK;
case S_IFBLK: return NFS_SPECFILE_BLK;
case S_IFCHR: return NFS_SPECFILE_CHR;
case S_IFIFO: return NFS_SPECFILE_FIFO;
@@ -61,6 +62,7 @@ static inline u64 reparse_mode_nfs_type(mode_t mode)
static inline u32 reparse_mode_wsl_tag(mode_t mode)
{
switch (mode & S_IFMT) {
+ case S_IFLNK: return IO_REPARSE_TAG_LX_SYMLINK;
case S_IFBLK: return IO_REPARSE_TAG_LX_BLK;
case S_IFCHR: return IO_REPARSE_TAG_LX_CHR;
case S_IFIFO: return IO_REPARSE_TAG_LX_FIFO;
@@ -97,14 +99,30 @@ static inline bool reparse_inode_match(struct inode *inode,
static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
{
- struct smb2_file_all_info *fi = &data->fi;
- u32 attrs = le32_to_cpu(fi->Attributes);
+ u32 attrs;
bool ret;
- ret = data->reparse_point || (attrs & ATTR_REPARSE);
- if (ret)
- attrs |= ATTR_REPARSE;
- fi->Attributes = cpu_to_le32(attrs);
+ if (data->contains_posix_file_info) {
+ struct smb311_posix_qinfo *fi = &data->posix_fi;
+
+ attrs = le32_to_cpu(fi->DosAttributes);
+ if (data->reparse_point) {
+ attrs |= ATTR_REPARSE;
+ fi->DosAttributes = cpu_to_le32(attrs);
+ }
+
+ } else {
+ struct smb2_file_all_info *fi = &data->fi;
+
+ attrs = le32_to_cpu(fi->Attributes);
+ if (data->reparse_point) {
+ attrs |= ATTR_REPARSE;
+ fi->Attributes = cpu_to_le32(attrs);
+ }
+ }
+
+ ret = attrs & ATTR_REPARSE;
+
return ret;
}
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 91d4d409cb1d..faa80e7d54a6 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -1235,12 +1235,13 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
switch (requested) {
case Kerberos:
case RawNTLMSSP:
+ case IAKerb:
return requested;
case Unspecified:
if (server->sec_ntlmssp &&
(global_secflags & CIFSSEC_MAY_NTLMSSP))
return RawNTLMSSP;
- if ((server->sec_kerberos || server->sec_mskerberos) &&
+ if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
fallthrough;
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 749a83cd0deb..d6e2fb669c40 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -377,7 +377,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
static void
cifs_downgrade_oplock(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
cifs_set_oplock_level(cinode, oplock);
}
@@ -551,7 +551,7 @@ static int cifs_query_path_info(const unsigned int xid,
int rc;
FILE_ALL_INFO fi = {};
- data->symlink = false;
+ data->reparse_point = false;
data->adjust_tz = false;
/* could do find first instead but this returns more info */
@@ -569,32 +569,8 @@ static int cifs_query_path_info(const unsigned int xid,
}
if (!rc) {
- int tmprc;
- int oplock = 0;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
-
move_cifs_info_to_smb2(&data->fi, &fi);
-
- if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE))
- return 0;
-
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .cifs_sb = cifs_sb,
- .desired_access = FILE_READ_ATTRIBUTES,
- .create_options = cifs_create_options(cifs_sb, 0),
- .disposition = FILE_OPEN,
- .path = full_path,
- .fid = &fid,
- };
-
- /* Need to check if this is a symbolic link or not */
- tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
- if (tmprc == -EOPNOTSUPP)
- data->symlink = true;
- else if (tmprc == 0)
- CIFSSMBClose(xid, tcon, fid.netfid);
+ data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE;
}
return rc;
@@ -1010,7 +986,7 @@ static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb,
buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol +
le32_to_cpu(io->DataOffset));
- return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data);
+ return parse_reparse_point(buf, plen, cifs_sb, full_path, data);
}
static bool
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index 9ec44eab8dbc..d609a20fb98a 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -63,6 +63,52 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov)
return sym;
}
+int smb2_fix_symlink_target_type(char **target, bool directory, struct cifs_sb_info *cifs_sb)
+{
+ char *buf;
+ int len;
+
+ /*
+ * A POSIX server does not distinguish between symlinks to files and
+ * symlinks to directories, so nothing needs to be fixed on the client side.
+ */
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
+ return 0;
+
+ if (!*target)
+ return -EIO;
+
+ len = strlen(*target);
+ if (!len)
+ return -EIO;
+
+ /*
+ * If this is a directory symlink and it does not have a trailing slash
+ * then append one. The trailing slash simulates Windows/SMB behavior,
+ * which does not allow resolving a directory symlink to a file.
+ */
+ if (directory && (*target)[len-1] != '/') {
+ buf = krealloc(*target, len+2, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ buf[len] = '/';
+ buf[len+1] = '\0';
+ *target = buf;
+ len++;
+ }
+
+ /*
+ * If this is a file (non-directory) symlink and it points to a path name
+ * with a trailing slash then this is an invalid symlink because a file
+ * name cannot contain a slash character. A file name with a slash is
+ * invalid on both Windows and Linux systems, so return an error for such
+ * a symlink.
+ */
+ if (!directory && (*target)[len-1] == '/')
+ return -EIO;
+
+ return 0;
+}
+
int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov,
const char *full_path, char **path)
{
@@ -89,7 +135,6 @@ int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec
return smb2_parse_native_symlink(path,
(char *)sym->PathBuffer + sub_offs,
sub_len,
- true,
le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE,
full_path,
cifs_sb);
@@ -133,6 +178,11 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
NULL, NULL, NULL);
oparms->create_options &= ~OPEN_REPARSE_POINT;
}
+ if (!rc) {
+ bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY;
+ rc = smb2_fix_symlink_target_type(&data->symlink_target,
+ directory, oparms->cifs_sb);
+ }
}
}
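A small userspace sketch of the trailing-slash rule enforced by smb2_fix_symlink_target_type() above: directory symlink targets gain a trailing slash, file symlink targets ending in '/' are rejected. The helper name fix_target() and its NULL-on-error convention are assumptions for illustration; they only loosely mirror the kernel function:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *fix_target(const char *target, int directory)
{
        size_t len = strlen(target);
        char *out;

        if (!len)
                return NULL;                    /* empty target: error */
        if (!directory && target[len - 1] == '/')
                return NULL;                    /* file name cannot contain '/' */

        out = malloc(len + 2);
        if (!out)
                return NULL;
        strcpy(out, target);
        if (directory && out[len - 1] != '/')
                strcat(out, "/");               /* simulate Windows/SMB behaviour */
        return out;
}

int main(void)
{
        char *dir = fix_target("subdir/target", 1);
        char *bad = fix_target("file/", 0);

        printf("%s\n", dir);                    /* subdir/target/ */
        printf("%s\n", bad ? bad : "rejected"); /* rejected */
        free(dir);
        free(bad);
        return 0;
}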
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index c97f14757c27..826b57a5a2a8 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -650,6 +650,7 @@ finished:
switch (cmds[i]) {
case SMB2_OP_QUERY_INFO:
idata = in_iov[i].iov_base;
+ idata->contains_posix_file_info = false;
if (rc == 0 && cfile && cfile->symlink_target) {
idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
if (!idata->symlink_target)
@@ -673,6 +674,7 @@ finished:
break;
case SMB2_OP_POSIX_QUERY_INFO:
idata = in_iov[i].iov_base;
+ idata->contains_posix_file_info = true;
if (rc == 0 && cfile && cfile->symlink_target) {
idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
if (!idata->symlink_target)
@@ -770,6 +772,7 @@ finished:
idata = in_iov[i].iov_base;
idata->reparse.io.iov = *iov;
idata->reparse.io.buftype = resp_buftype[i + 1];
+ idata->contains_posix_file_info = false; /* BB VERIFY */
rbuf = reparse_buf_ptr(iov);
if (IS_ERR(rbuf)) {
rc = PTR_ERR(rbuf);
@@ -791,6 +794,7 @@ finished:
case SMB2_OP_QUERY_WSL_EA:
if (!rc) {
idata = in_iov[i].iov_base;
+ idata->contains_posix_file_info = false;
qi_rsp = rsp_iov[i + 1].iov_base;
data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset);
size[0] = le32_to_cpu(qi_rsp->OutputBufferLength);
@@ -1010,6 +1014,11 @@ int smb2_query_path_info(const unsigned int xid,
else
rc = -EOPNOTSUPP;
}
+
+ if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) {
+ bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY;
+ rc = smb2_fix_symlink_target_type(&data->symlink_target, directory, cifs_sb);
+ }
break;
case -EREMOTE:
break;
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index b05313acf9b2..12c2b868789f 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -380,7 +380,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_NO_LOGON_SERVERS, -EIO, "STATUS_NO_LOGON_SERVERS"},
{STATUS_NO_SUCH_LOGON_SESSION, -EIO, "STATUS_NO_SUCH_LOGON_SESSION"},
{STATUS_NO_SUCH_PRIVILEGE, -EIO, "STATUS_NO_SUCH_PRIVILEGE"},
- {STATUS_PRIVILEGE_NOT_HELD, -EIO, "STATUS_PRIVILEGE_NOT_HELD"},
+ {STATUS_PRIVILEGE_NOT_HELD, -EPERM, "STATUS_PRIVILEGE_NOT_HELD"},
{STATUS_INVALID_ACCOUNT_NAME, -EIO, "STATUS_INVALID_ACCOUNT_NAME"},
{STATUS_USER_EXISTS, -EIO, "STATUS_USER_EXISTS"},
{STATUS_NO_SUCH_USER, -EIO, "STATUS_NO_SUCH_USER"},
@@ -871,7 +871,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_VALIDATE_CONTINUE, -EIO, "STATUS_VALIDATE_CONTINUE"},
{STATUS_NO_MATCH, -EIO, "STATUS_NO_MATCH"},
{STATUS_NO_MORE_MATCHES, -EIO, "STATUS_NO_MORE_MATCHES"},
- {STATUS_NOT_A_REPARSE_POINT, -EIO, "STATUS_NOT_A_REPARSE_POINT"},
+ {STATUS_NOT_A_REPARSE_POINT, -ENODATA, "STATUS_NOT_A_REPARSE_POINT"},
{STATUS_IO_REPARSE_TAG_INVALID, -EIO, "STATUS_IO_REPARSE_TAG_INVALID"},
{STATUS_IO_REPARSE_TAG_MISMATCH, -EIO,
"STATUS_IO_REPARSE_TAG_MISMATCH"},
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index d640dcabc305..23e0c8be7fb5 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -1001,6 +1001,7 @@ static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
if (!data->symlink_target)
return -ENOMEM;
}
+ data->contains_posix_file_info = false;
return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi);
}
@@ -3904,22 +3905,22 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
static void
smb2_downgrade_oplock(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
server->ops->set_oplock_level(cinode, oplock, 0, NULL);
}
static void
smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache);
+ __u16 epoch, bool *purge_cache);
static void
smb3_downgrade_oplock(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
unsigned int old_state = cinode->oplock;
- unsigned int old_epoch = cinode->epoch;
+ __u16 old_epoch = cinode->epoch;
unsigned int new_state;
if (epoch > old_epoch) {
@@ -3939,7 +3940,7 @@ smb3_downgrade_oplock(struct TCP_Server_Info *server,
static void
smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
oplock &= 0xFF;
cinode->lease_granted = false;
@@ -3963,7 +3964,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
static void
smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
char message[5] = {0};
unsigned int new_oplock = 0;
@@ -4000,7 +4001,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
static void
smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
unsigned int old_oplock = cinode->oplock;
@@ -4114,7 +4115,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock)
}
static __u8
-smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
+smb2_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key)
{
struct create_lease *lc = (struct create_lease *)buf;
@@ -4125,7 +4126,7 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
}
static __u8
-smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
+smb3_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key)
{
struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
@@ -5077,6 +5078,7 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
{
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
+ struct cifs_open_info_data idata;
struct cifs_io_parms io_parms = {};
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_fid fid;
@@ -5145,11 +5147,21 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
FILE_CREATE, CREATE_NOT_DIR |
CREATE_OPTION_SPECIAL, ACL_NO_MODE);
oparms.fid = &fid;
-
- rc = server->ops->open(xid, &oparms, &oplock, NULL);
+ idata.contains_posix_file_info = false;
+ rc = server->ops->open(xid, &oparms, &oplock, &idata);
if (rc)
goto out;
+ /*
+ * Check if the server honored the ATTR_SYSTEM flag requested via the
+ * CREATE_OPTION_SPECIAL option. If not, then the server does not support
+ * ATTR_SYSTEM and the newly created file is not SFU-compatible, which
+ * means that the call failed.
+ */
+ if (!(le32_to_cpu(idata.fi.Attributes) & ATTR_SYSTEM)) {
+ rc = -EOPNOTSUPP;
+ goto out_close;
+ }
+
if (type_len + data_len > 0) {
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
@@ -5164,8 +5176,18 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
iov, ARRAY_SIZE(iov)-1);
}
+out_close:
server->ops->close(xid, tcon, &fid);
+ /*
+ * If CREATE was successful but either setting ATTR_SYSTEM failed or
+ * writing the type/data information failed, then remove the intermediate
+ * object created by CREATE. Otherwise an intermediate empty object stays
+ * on the server.
+ */
+ if (rc)
+ server->ops->unlink(xid, tcon, full_path, cifs_sb, NULL);
+
out:
kfree(symname_utf16);
return rc;
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 9f54596a6866..ed7812247ebc 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -1429,7 +1429,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
if (server->sec_ntlmssp &&
(global_secflags & CIFSSEC_MAY_NTLMSSP))
return RawNTLMSSP;
- if ((server->sec_kerberos || server->sec_mskerberos) &&
+ if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
fallthrough;
@@ -2169,7 +2169,7 @@ tcon_exit:
tcon_error_exit:
if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME)
- cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
+ cifs_dbg(VFS | ONCE, "BAD_NETWORK_NAME: %s\n", tree);
goto tcon_exit;
}
@@ -2329,7 +2329,7 @@ parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info,
int smb2_parse_contexts(struct TCP_Server_Info *server,
struct kvec *rsp_iov,
- unsigned int *epoch,
+ __u16 *epoch,
char *lease_key, __u8 *oplock,
struct smb2_file_all_info *buf,
struct create_posix_rsp *posix)
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 09349fa8da03..4662c7e2d259 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -111,8 +111,9 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const unsigned char *path, char *pbuf,
unsigned int *pbytes_read);
+int smb2_fix_symlink_target_type(char **target, bool directory, struct cifs_sb_info *cifs_sb);
int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len,
- bool unicode, bool relative,
+ bool relative,
const char *full_path,
struct cifs_sb_info *cifs_sb);
int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb,
@@ -282,7 +283,7 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
int smb2_parse_contexts(struct TCP_Server_Info *server,
struct kvec *rsp_iov,
- unsigned int *epoch,
+ __u16 *epoch,
char *lease_key, __u8 *oplock,
struct smb2_file_all_info *buf,
struct create_posix_rsp *posix);
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 3c7c706c797d..c7a0efda4403 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -1550,7 +1550,19 @@ struct reparse_symlink_data_buffer {
__u8 PathBuffer[]; /* Variable Length */
} __packed;
-/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
+/* For IO_REPARSE_TAG_NFS - see MS-FSCC 2.1.2.6 */
+#define NFS_SPECFILE_LNK 0x00000000014B4E4C
+#define NFS_SPECFILE_CHR 0x0000000000524843
+#define NFS_SPECFILE_BLK 0x00000000004B4C42
+#define NFS_SPECFILE_FIFO 0x000000004F464946
+#define NFS_SPECFILE_SOCK 0x000000004B434F53
+struct reparse_nfs_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le64 InodeType; /* NFS_SPECFILE_* */
+ __u8 DataBuffer[];
+} __packed;
/* For IO_REPARSE_TAG_LX_SYMLINK */
struct reparse_wsl_symlink_data_buffer {
@@ -1695,23 +1707,33 @@ struct smb2_file_internal_info {
} __packed; /* level 6 Query */
struct smb2_file_rename_info { /* encoding of request for level 10 */
- __u8 ReplaceIfExists; /* 1 = replace existing target with new */
- /* 0 = fail if target already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
+ /* New members MUST be added within the struct_group() macro below. */
+ __struct_group(smb2_file_rename_info_hdr, __hdr, __packed,
+ __u8 ReplaceIfExists; /* 1 = replace existing target with new */
+ /* 0 = fail if target already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
+ __le32 FileNameLength;
+ );
char FileName[]; /* New name to be assigned */
/* padding - overall struct size must be >= 24 so filename + pad >= 6 */
} __packed; /* level 10 Set */
+static_assert(offsetof(struct smb2_file_rename_info, FileName) == sizeof(struct smb2_file_rename_info_hdr),
+ "struct member likely outside of __struct_group()");
struct smb2_file_link_info { /* encoding of request for level 11 */
- __u8 ReplaceIfExists; /* 1 = replace existing link with new */
- /* 0 = fail if link already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
+ /* New members MUST be added within the struct_group() macro below. */
+ __struct_group(smb2_file_link_info_hdr, __hdr, __packed,
+ __u8 ReplaceIfExists; /* 1 = replace existing link with new */
+ /* 0 = fail if link already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
+ __le32 FileNameLength;
+ );
char FileName[]; /* Name to be assigned to new link */
} __packed; /* level 11 Set */
+static_assert(offsetof(struct smb2_file_link_info, FileName) == sizeof(struct smb2_file_link_info_hdr),
+ "struct member likely outside of __struct_group()");
/*
* This level 18, although with struct with same name is different from cifs
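As a reading aid for the new reparse_nfs_data_buffer layout above, here is a rough host-order mock showing how nfs_set_reparse_buf() fills it for a character device; the on-wire fields are packed little-endian (__le*), which this standalone sketch deliberately ignores, and the device numbers are arbitrary example values:

#include <stdint.h>
#include <stdio.h>

/* Host-order mock of struct reparse_nfs_data_buffer for a char device. */
struct nfs_reparse_mock {
        uint32_t ReparseTag;            /* IO_REPARSE_TAG_NFS (0x80000014) */
        uint16_t ReparseDataLength;     /* bytes following this 8-byte header */
        uint16_t Reserved;
        uint64_t InodeType;             /* NFS_SPECFILE_* */
        uint32_t DataBuffer[2];         /* CHR/BLK: [0] = major, [1] = minor */
};

int main(void)
{
        struct nfs_reparse_mock buf = {
                .ReparseTag = 0x80000014,
                .InodeType = 0x0000000000524843ULL,     /* NFS_SPECFILE_CHR */
                .DataBuffer = { 1, 3 },                 /* e.g. char 1:3 */
        };

        buf.ReparseDataLength = sizeof(buf.InodeType) + sizeof(buf.DataBuffer);
        printf("ReparseDataLength = %u\n", (unsigned)buf.ReparseDataLength); /* 16 */
        return 0;
}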
diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h
index 4b379e84c46b..3253a18ecb5c 100644
--- a/fs/smb/common/smbfsctl.h
+++ b/fs/smb/common/smbfsctl.h
@@ -159,6 +159,9 @@
#define IO_REPARSE_TAG_LX_CHR 0x80000025
#define IO_REPARSE_TAG_LX_BLK 0x80000026
+/* If Name Surrogate Bit is set, the file or directory represents another named entity in the system. */
+#define IS_REPARSE_TAG_NAME_SURROGATE(tag) (!!((tag) & 0x20000000))
+
/* fsctl flags */
/* If Flags is set to this value, the request is an FSCTL not ioctl request */
#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
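A quick illustration of the name-surrogate test defined above; the tag values are from MS-FSCC, and the check is simply bit 29 of the reparse tag:

#include <stdio.h>

#define IS_REPARSE_TAG_NAME_SURROGATE(tag) (!!((tag) & 0x20000000))

int main(void)
{
        /* Mount points and NT symlinks are name surrogates ... */
        printf("%d\n", IS_REPARSE_TAG_NAME_SURROGATE(0xA0000003)); /* MOUNT_POINT -> 1 */
        printf("%d\n", IS_REPARSE_TAG_NAME_SURROGATE(0xA000000C)); /* SYMLINK     -> 1 */
        /* ... while NFS-style special files are not. */
        printf("%d\n", IS_REPARSE_TAG_NAME_SURROGATE(0x80000014)); /* NFS         -> 0 */
        return 0;
}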
diff --git a/fs/stat.c b/fs/stat.c
index 2c0e111a098a..f13308bfdc98 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -281,6 +281,8 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
u32 request_mask)
{
int error = vfs_getattr(path, stat, request_mask, flags);
+ if (error)
+ return error;
if (request_mask & STATX_MNT_ID_UNIQUE) {
stat->mnt_id = real_mount(path->mnt)->mnt_id_unique;
@@ -302,7 +304,7 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
if (S_ISBLK(stat->mode))
bdev_statx(path, stat, request_mask);
- return error;
+ return 0;
}
static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
diff --git a/fs/sysctls.c b/fs/sysctls.c
index 8dbde9a802fa..ad429dffeb4b 100644
--- a/fs/sysctls.c
+++ b/fs/sysctls.c
@@ -7,7 +7,7 @@
#include <linux/init.h>
#include <linux/sysctl.h>
-static struct ctl_table fs_shared_sysctls[] = {
+static const struct ctl_table fs_shared_sysctls[] = {
{
.procname = "overflowuid",
.data = &fs_overflowuid,
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index cfc614c638da..53214499e384 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -457,7 +457,8 @@ static void tracefs_d_release(struct dentry *dentry)
eventfs_d_release(dentry);
}
-static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int tracefs_d_revalidate(struct inode *inode, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
struct eventfs_inode *ei = dentry->d_fsdata;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 5cc69beaa62e..b01f382ce8db 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -863,7 +863,6 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum)
out:
vfree(buf);
- return;
}
void ubifs_dump_znode(const struct ubifs_info *c,
@@ -946,16 +945,20 @@ void ubifs_dump_tnc(struct ubifs_info *c)
pr_err("\n");
pr_err("(pid %d) start dumping TNC tree\n", current->pid);
- znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, NULL);
- level = znode->level;
- pr_err("== Level %d ==\n", level);
- while (znode) {
- if (level != znode->level) {
- level = znode->level;
- pr_err("== Level %d ==\n", level);
+ if (c->zroot.znode) {
+ znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, NULL);
+ level = znode->level;
+ pr_err("== Level %d ==\n", level);
+ while (znode) {
+ if (level != znode->level) {
+ level = znode->level;
+ pr_err("== Level %d ==\n", level);
+ }
+ ubifs_dump_znode(c, znode);
+ znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, znode);
}
- ubifs_dump_znode(c, znode);
- znode = ubifs_tnc_levelorder_next(c, c->zroot.znode, znode);
+ } else {
+ pr_err("empty TNC tree in memory\n");
}
pr_err("(pid %d) finish dumping TNC tree\n", current->pid);
}
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index aa8837e6247c..f2cb214581fd 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1932,7 +1932,6 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum);
out:
vfree(buf);
- return;
}
/**
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7c0bd0b55f88..97c4d71115d8 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -36,7 +36,7 @@
static int sysctl_unprivileged_userfaultfd __read_mostly;
#ifdef CONFIG_SYSCTL
-static struct ctl_table vm_userfaultfd_table[] = {
+static const struct ctl_table vm_userfaultfd_table[] = {
{
.procname = "unprivileged_userfaultfd",
.data = &sysctl_unprivileged_userfaultfd,
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 5f1a14d5b927..a859ac9b74ba 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -192,7 +192,8 @@ const struct file_operations vboxsf_dir_fops = {
* This is called during name resolution/lookup to check if the @dentry in
* the cache is still valid. The job is handled by vboxsf_inode_revalidate.
*/
-static int vboxsf_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+static int vboxsf_dentry_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index e95b8a48d8a0..1d94bb784108 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -21,7 +21,8 @@
#define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */
-static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375";
+static const unsigned char VBSF_MOUNT_SIGNATURE[4] = { '\000', '\377', '\376',
+ '\375' };
static int follow_symlinks;
module_param(follow_symlinks, int, 0444);
diff --git a/fs/verity/init.c b/fs/verity/init.c
index f440f0e61e3e..6e8d33b50240 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -10,7 +10,7 @@
#include <linux/ratelimit.h>
#ifdef CONFIG_SYSCTL
-static struct ctl_table fsverity_sysctl_table[] = {
+static const struct ctl_table fsverity_sysctl_table[] = {
#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
{
.procname = "require_signatures",
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 40ad22fb808b..0ef19f1469ec 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3563,12 +3563,12 @@ xfs_bmap_btalloc_at_eof(
int error;
/*
- * If there are already extents in the file, try an exact EOF block
- * allocation to extend the file as a contiguous extent. If that fails,
- * or it's the first allocation in a file, just try for a stripe aligned
- * allocation.
+ * If there are already extents in the file, and xfs_bmap_adjacent() has
+ * given a better blkno, try an exact EOF block allocation to extend the
+ * file as a contiguous extent. If that fails, or it's the first
+ * allocation in a file, just try for a stripe aligned allocation.
*/
- if (ap->offset) {
+ if (ap->eof) {
xfs_extlen_t nextminlen = 0;
/*
@@ -3736,7 +3736,8 @@ xfs_bmap_btalloc_best_length(
int error;
ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino);
- xfs_bmap_adjacent(ap);
+ if (!xfs_bmap_adjacent(ap))
+ ap->eof = false;
/*
* Search for an allocation group with a single extent large enough for
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index bdcd40f0ec74..19877d99f255 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -224,7 +224,6 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
bool xchk_dir_looks_zapped(struct xfs_inode *dp);
bool xchk_pptr_looks_zapped(struct xfs_inode *ip);
-#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Decide if a repair is required. */
static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
{
@@ -244,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc)
return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
!(sc->flags & XREP_ALREADY_FIXED);
}
-#else
-# define xchk_needs_repair(sc) (false)
-# define xchk_could_repair(sc) (false)
-#endif /* CONFIG_XFS_ONLINE_REPAIR */
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 2f641b6d663e..13ff1c933cb8 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -1055,9 +1055,17 @@ xrep_dinode_check_dfork(
return true;
break;
case S_IFREG:
- if (fmt == XFS_DINODE_FMT_LOCAL)
+ switch (fmt) {
+ case XFS_DINODE_FMT_LOCAL:
return true;
- fallthrough;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_META_BTREE:
+ break;
+ default:
+ return true;
+ }
+ break;
case S_IFLNK:
case S_IFDIR:
switch (fmt) {
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 823c00d1a502..af0a3a9e5ed9 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -191,7 +191,16 @@ int xrep_reset_metafile_resv(struct xfs_scrub *sc);
#else
#define xrep_ino_dqattach(sc) (0)
-#define xrep_will_attempt(sc) (false)
+
+/*
+ * When online repair is not built into the kernel, we still want to attempt
+ * the repair so that the stub xrep_attempt below will return EOPNOTSUPP.
+ */
+static inline bool xrep_will_attempt(const struct xfs_scrub *sc)
+{
+ return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ xchk_needs_repair(sc->sm);
+}
static inline int
xrep_attempt(
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 7567dd5cad14..6fa9e3e5bab7 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -149,6 +149,18 @@ xchk_probe(
if (xchk_should_terminate(sc, &error))
return error;
+ /*
+ * If the caller is probing to see if repair works but repair isn't
+ * built into the kernel, return EOPNOTSUPP because that's the signal
+ * that userspace expects. If online repair is built in, set the
+ * CORRUPT flag (without any of the usual tracing/logging) to force us
+ * into xrep_probe.
+ */
+ if (xchk_could_repair(sc)) {
+ if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR))
+ return -EOPNOTSUPP;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ }
return 0;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 67877c36ed11..6d9965b546cb 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -19,6 +19,7 @@
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_icache.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -528,12 +529,44 @@ xfs_vm_readahead(
}
static int
-xfs_iomap_swapfile_activate(
+xfs_vm_swap_activate(
struct swap_info_struct *sis,
struct file *swap_file,
sector_t *span)
{
- sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+ struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+
+ /*
+ * Swap file activation can race against concurrent shared extent
+ * removal in files that have been cloned. If this happens,
+ * iomap_swapfile_iter() can fail because it encountered a shared
+ * extent even though an operation is in progress to remove those
+ * shared extents.
+ *
+ * This race becomes problematic when we defer extent removal
+ * operations beyond the end of a syscall (i.e. use async background
+ * processing algorithms). Users think the extents are no longer
+ * shared, but iomap_swapfile_iter() still sees them as shared
+ * because the refcountbt entries for the extents being removed have
+ * not yet been updated. Hence the swapon call fails unexpectedly.
+ *
+ * The race condition is currently most obvious from the unlink()
+ * operation as extent removal is deferred until after the last
+ * reference to the inode goes away. We then process the extent
+ * removal asynchronously, hence triggering the "syscall completed but
+ * work not done" condition mentioned above. To close this race
+ * window, we need to flush any pending inodegc operations to ensure
+ * they have updated the refcountbt records before we try to map the
+ * swapfile.
+ */
+ xfs_inodegc_flush(ip->i_mount);
+
+ /*
+ * Direct the swap code to the correct block device when this file
+ * sits on the RT device.
+ */
+ sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
+
return iomap_swapfile_activate(sis, swap_file, span,
&xfs_read_iomap_ops);
}
@@ -549,11 +582,11 @@ const struct address_space_operations xfs_address_space_operations = {
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.dirty_folio = noop_dirty_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};
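The swapfile fix is purely an ordering change: flush pending inodegc work so deferred unlink/extent removal has updated the refcount btree, and only then walk the extent map for swapon. A schematic user-space model of that ordering, with stub functions standing in for xfs_inodegc_flush() and iomap_swapfile_activate():

#include <stdio.h>

static void flush_deferred_inode_gc(void)
{
	/* stand-in: wait for background extent removal to update refcounts */
	puts("inodegc flushed");
}

static int map_swapfile_extents(void)
{
	/* stand-in: would fail if shared extents were still visible */
	puts("swapfile extents mapped");
	return 0;
}

/* Order matters: flush first, then map, or swapon can fail spuriously. */
static int swap_activate(void)
{
	flush_deferred_inode_gc();
	return map_swapfile_extents();
}

int main(void)
{
	return swap_activate();
}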
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d1d4a0a22e13..15bb790359f8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -41,8 +41,7 @@ struct kmem_cache *xfs_buf_cache;
*
* xfs_buf_rele:
* b_lock
- * pag_buf_lock
- * lru_lock
+ * lru_lock
*
* xfs_buftarg_drain_rele
* lru_lock
@@ -220,23 +219,25 @@ _xfs_buf_alloc(
*/
flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
- spin_lock_init(&bp->b_lock);
+ /*
+ * A new buffer is held and locked by the owner. This ensures that the
+ * buffer is owned by the caller and racing RCU lookups right after
+ * inserting into the hash table are safe (and will have to wait for
+ * the unlock to do anything non-trivial).
+ */
bp->b_hold = 1;
+ sema_init(&bp->b_sema, 0); /* held, no waiters */
+
+ spin_lock_init(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
INIT_LIST_HEAD(&bp->b_list);
INIT_LIST_HEAD(&bp->b_li_list);
- sema_init(&bp->b_sema, 0); /* held, no waiters */
bp->b_target = target;
bp->b_mount = target->bt_mount;
bp->b_flags = flags;
- /*
- * Set length and io_length to the same value initially.
- * I/O routines should use io_length, which will be the same in
- * most cases but may be reset (e.g. XFS recovery).
- */
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
kmem_cache_free(xfs_buf_cache, bp);
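The reordered initialisation makes the "born referenced and locked" invariant explicit: the hold count starts at 1 and the buffer semaphore starts locked, so an RCU lookup that finds the buffer right after the hash insert can only wait on the lock rather than see a half-initialised buffer. A user-space sketch of that birth state with a POSIX semaphore and a simplified stand-in for struct xfs_buf:

#include <semaphore.h>
#include <string.h>

struct buf {
	unsigned int	hold;	/* reference count, owner's ref included */
	sem_t		sema;	/* buffer lock */
};

/*
 * A new buffer comes into existence held and locked by its creator;
 * racing lookups may take a hold but block on the lock until the
 * owner finishes setting the buffer up and unlocks it.
 */
static void buf_init(struct buf *bp)
{
	memset(bp, 0, sizeof(*bp));
	bp->hold = 1;			/* owner's reference */
	sem_init(&bp->sema, 0, 0);	/* locked: no waiters can proceed */
}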
@@ -502,7 +503,6 @@ int
xfs_buf_cache_init(
struct xfs_buf_cache *bch)
{
- spin_lock_init(&bch->bc_lock);
return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
}
@@ -652,17 +652,20 @@ xfs_buf_find_insert(
if (error)
goto out_free_buf;
- spin_lock(&bch->bc_lock);
+ /* The new buffer keeps the perag reference until it is freed. */
+ new_bp->b_pag = pag;
+
+ rcu_read_lock();
bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
+ rcu_read_unlock();
error = PTR_ERR(bp);
- spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
if (bp && xfs_buf_try_hold(bp)) {
/* found an existing buffer */
- spin_unlock(&bch->bc_lock);
+ rcu_read_unlock();
error = xfs_buf_find_lock(bp, flags);
if (error)
xfs_buf_rele(bp);
@@ -670,10 +673,8 @@ xfs_buf_find_insert(
*bpp = bp;
goto out_free_buf;
}
+ rcu_read_unlock();
- /* The new buffer keeps the perag reference until it is freed. */
- new_bp->b_pag = pag;
- spin_unlock(&bch->bc_lock);
*bpp = new_bp;
return 0;
@@ -1090,7 +1091,6 @@ xfs_buf_rele_cached(
}
/* we are asked to drop the last reference */
- spin_lock(&bch->bc_lock);
__xfs_buf_ioacct_dec(bp);
if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
/*
@@ -1102,7 +1102,6 @@ xfs_buf_rele_cached(
bp->b_state &= ~XFS_BSTATE_DISPOSE;
else
bp->b_hold--;
- spin_unlock(&bch->bc_lock);
} else {
bp->b_hold--;
/*
@@ -1120,7 +1119,6 @@ xfs_buf_rele_cached(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
- spin_unlock(&bch->bc_lock);
if (pag)
xfs_perag_put(pag);
freebuf = true;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7e73663c5d4a..3b4ed42e11c0 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -80,7 +80,6 @@ typedef unsigned int xfs_buf_flags_t;
#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */
struct xfs_buf_cache {
- spinlock_t bc_lock;
struct rhashtable bc_hash;
};
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index f340a2015c4c..0b41bdfecdfb 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -329,22 +329,6 @@ out_trans_cancel:
* successfully but before locks are dropped.
*/
-/* Verify that we have security clearance to perform this operation. */
-static int
-xfs_exchange_range_verify_area(
- struct xfs_exchrange *fxr)
-{
- int ret;
-
- ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
- true);
- if (ret)
- return ret;
-
- return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
- true);
-}
-
/*
* Performs necessary checks before doing a range exchange, having stabilized
* mutable inode attributes via i_rwsem.
@@ -355,11 +339,13 @@ xfs_exchange_range_checks(
unsigned int alloc_unit)
{
struct inode *inode1 = file_inode(fxr->file1);
+ loff_t size1 = i_size_read(inode1);
struct inode *inode2 = file_inode(fxr->file2);
+ loff_t size2 = i_size_read(inode2);
uint64_t allocmask = alloc_unit - 1;
int64_t test_len;
uint64_t blen;
- loff_t size1, size2, tmp;
+ loff_t tmp;
int error;
/* Don't touch certain kinds of inodes */
@@ -368,24 +354,25 @@ xfs_exchange_range_checks(
if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
return -ETXTBSY;
- size1 = i_size_read(inode1);
- size2 = i_size_read(inode2);
-
/* Ranges cannot start after EOF. */
if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
return -EINVAL;
- /*
- * If the caller said to exchange to EOF, we set the length of the
- * request large enough to cover everything to the end of both files.
- */
if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+ /*
+ * If the caller said to exchange to EOF, we set the length of
+ * the request large enough to cover everything to the end of
+ * both files.
+ */
fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
size2 - fxr->file2_offset);
-
- error = xfs_exchange_range_verify_area(fxr);
- if (error)
- return error;
+ } else {
+ /*
+ * Otherwise we require both ranges to end within EOF.
+ */
+ if (fxr->file1_offset + fxr->length > size1 ||
+ fxr->file2_offset + fxr->length > size2)
+ return -EINVAL;
}
/*
@@ -402,15 +389,6 @@ xfs_exchange_range_checks(
return -EINVAL;
/*
- * We require both ranges to end within EOF, unless we're exchanging
- * to EOF.
- */
- if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
- (fxr->file1_offset + fxr->length > size1 ||
- fxr->file2_offset + fxr->length > size2))
- return -EINVAL;
-
- /*
* Make sure we don't hit any file size limits. If we hit any size
* limits such that test_length was adjusted, we abort the whole
* operation.
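The restructured checks compute the exchange length up front when XFS_EXCHANGE_RANGE_TO_EOF is set (covering both files out to EOF) and otherwise insist that both ranges end within EOF. A small user-space model of that branch, with a simplified stand-in for the fields used from struct xfs_exchrange:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct xrange {
	int64_t	file1_offset, file2_offset, length;
	bool	to_eof;		/* XFS_EXCHANGE_RANGE_TO_EOF */
};

static int64_t max64(int64_t a, int64_t b) { return a > b ? a : b; }

/* size1/size2 are the i_size values read while the iolock is held. */
static int check_lengths(struct xrange *fxr, int64_t size1, int64_t size2)
{
	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
		return -EINVAL;		/* ranges cannot start after EOF */

	if (fxr->to_eof) {
		/* exchange everything from the offsets to both EOFs */
		fxr->length = max64(size1 - fxr->file1_offset,
				    size2 - fxr->file2_offset);
	} else if (fxr->file1_offset + fxr->length > size1 ||
		   fxr->file2_offset + fxr->length > size2) {
		return -EINVAL;		/* both ranges must end within EOF */
	}
	return 0;
}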
@@ -747,6 +725,7 @@ xfs_exchange_range(
{
struct inode *inode1 = file_inode(fxr->file1);
struct inode *inode2 = file_inode(fxr->file2);
+ loff_t check_len = fxr->length;
int ret;
BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
@@ -779,14 +758,18 @@ xfs_exchange_range(
return -EBADF;
/*
- * If we're not exchanging to EOF, we can check the areas before
- * stabilizing both files' i_size.
+ * If we're exchanging to EOF, we can't calculate the length until we
+ * take the iolock. Pass a 0 length to remap_verify_area, similarly to
+ * the FICLONE and FICLONERANGE ioctls, which also support cloning to EOF.
*/
- if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
- ret = xfs_exchange_range_verify_area(fxr);
- if (ret)
- return ret;
- }
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+ check_len = 0;
+ ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true);
+ if (ret)
+ return ret;
+ ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true);
+ if (ret)
+ return ret;
/* Update cmtime if the fd/inode don't forbid it. */
if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
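Before the iolock stabilises i_size, a TO_EOF exchange has no known length, so the early remap_verify_area() calls are made with a zero length and the real bounds check is left to xfs_exchange_range_checks() above. A tiny sketch of the length selection; the flag value is a hypothetical stand-in for XFS_EXCHANGE_RANGE_TO_EOF:

#include <stdint.h>

#define EXCH_TO_EOF	(1u << 0)	/* stand-in flag bit */

/* Length for the pre-iolock verification: 0 means "unknown, to EOF". */
static int64_t verify_length(uint32_t flags, int64_t length)
{
	return (flags & EXCH_TO_EOF) ? 0 : length;
}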
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c95fe1b1de4e..b1f9f156ec88 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1404,8 +1404,11 @@ xfs_inactive(
goto out;
/* Try to clean out the cow blocks if there are any. */
- if (xfs_inode_has_cow_data(ip))
- xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+ if (xfs_inode_has_cow_data(ip)) {
+ error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+ if (error)
+ goto out;
+ }
if (VFS_I(ip)->i_nlink != 0) {
/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 50fa3ef89f6c..d61460309a78 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -976,10 +976,8 @@ xfs_dax_write_iomap_end(
if (!xfs_is_cow_inode(ip))
return 0;
- if (!written) {
- xfs_reflink_cancel_cow_range(ip, pos, length, true);
- return 0;
- }
+ if (!written)
+ return xfs_reflink_cancel_cow_range(ip, pos, length, true);
return xfs_reflink_end_cow(ip, pos, written);
}
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 37f1230e7584..245d754f382a 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -78,6 +78,28 @@ xfs_qm_statvfs(
}
}
+STATIC int
+xfs_qm_validate_state_change(
+ struct xfs_mount *mp,
+ uint uqd,
+ uint gqd,
+ uint pqd)
+{
+ int state;
+
+ /* Is quota state changing? */
+ state = ((uqd && !XFS_IS_UQUOTA_ON(mp)) ||
+ (!uqd && XFS_IS_UQUOTA_ON(mp)) ||
+ (gqd && !XFS_IS_GQUOTA_ON(mp)) ||
+ (!gqd && XFS_IS_GQUOTA_ON(mp)) ||
+ (pqd && !XFS_IS_PQUOTA_ON(mp)) ||
+ (!pqd && XFS_IS_PQUOTA_ON(mp)));
+
+ return state &&
+ (xfs_dev_is_read_only(mp, "changing quota state") ||
+ xfs_has_norecovery(mp));
+}
+
int
xfs_qm_newmount(
xfs_mount_t *mp,
@@ -97,24 +119,25 @@ xfs_qm_newmount(
}
/*
- * If the device itself is read-only, we can't allow
- * the user to change the state of quota on the mount -
- * this would generate a transaction on the ro device,
- * which would lead to an I/O error and shutdown
+ * If the device itself is read-only and/or in norecovery
+ * mode, we can't allow the user to change the state of
+ * quota on the mount - this would generate a transaction
+ * on the ro device, which would lead to an I/O error and
+ * shutdown.
*/
- if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
- (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
- (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
- (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
- (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
- (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
- xfs_dev_is_read_only(mp, "changing quota state")) {
- xfs_warn(mp, "please mount with%s%s%s%s.",
- (!quotaondisk ? "out quota" : ""),
- (uquotaondisk ? " usrquota" : ""),
- (gquotaondisk ? " grpquota" : ""),
- (pquotaondisk ? " prjquota" : ""));
+ if (xfs_qm_validate_state_change(mp, uquotaondisk,
+ gquotaondisk, pquotaondisk)) {
+
+ if (xfs_has_metadir(mp))
+ xfs_warn(mp,
+ "metadir enabled, please mount without any quota mount options");
+ else
+ xfs_warn(mp, "please mount with%s%s%s%s.",
+ (!quotaondisk ? "out quota" : ""),
+ (uquotaondisk ? " usrquota" : ""),
+ (gquotaondisk ? " grpquota" : ""),
+ (pquotaondisk ? " prjquota" : ""));
return -EPERM;
}
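A quota state change needs a transaction, which a read-only or norecovery mount cannot issue, so such mounts are refused with a hint about which quota options to drop (or, with metadir, to drop them all). A hypothetical user-space model of that decision, reduced to a single quota type for brevity; the messages are placeholders, not the kernel's wording:

#include <stdbool.h>
#include <stdio.h>

static bool quota_state_changing(bool ondisk, bool incore)
{
	return ondisk != incore;
}

/* ro_or_norecovery models the read-only / norecovery mount test. */
static int validate_quota_mount(bool quota_ondisk, bool quota_incore,
				bool ro_or_norecovery, bool metadir)
{
	if (quota_state_changing(quota_ondisk, quota_incore) &&
	    ro_or_norecovery) {
		if (metadir)
			puts("metadir enabled, mount without quota options");
		else
			puts("please mount with matching quota options");
		return -1;	/* -EPERM in the kernel */
	}
	return 0;
}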
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d92d7a07ea89..0055066fb1d9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1661,8 +1661,12 @@ xfs_fs_fill_super(
#endif
}
- /* Filesystem claims it needs repair, so refuse the mount. */
- if (xfs_has_needsrepair(mp)) {
+ /*
+ * Filesystem claims it needs repair, so refuse the mount unless
+ * norecovery is also specified, in which case the filesystem can
+ * be mounted with no risk of further damage.
+ */
+ if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) {
xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair.");
error = -EFSCORRUPTED;
goto out_free_sb;
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index c84df23b494d..751dc74a3067 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -66,7 +66,7 @@ xfs_deprecated_dointvec_minmax(
return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
}
-static struct ctl_table xfs_table[] = {
+static const struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,